Spaces:
Sleeping
Sleeping
import requests
from bs4 import BeautifulSoup
import time
def read_file(file_path):
    """
    Read text content from a file.

    Tries UTF-8 first and falls back to Latin-1 when the bytes are not
    valid UTF-8.

    Args:
        file_path (str): Path to the text file.

    Returns:
        str: File content, stripped of leading/trailing whitespace.

    Raises:
        Exception: If the file cannot be read or is empty.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        # UTF-8 failed; Latin-1 maps every byte value, so this fallback
        # read cannot raise a decode error itself.
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read().strip()
        except Exception as e:
            raise Exception(f"Failed to read file with alternative encoding: {str(e)}")
    except Exception as e:
        raise Exception(f"File reading failed: {str(e)}")
    # Emptiness is checked outside the try blocks so the message is raised
    # cleanly instead of being re-wrapped by the handlers above
    # (previously: "File reading failed: File is empty").
    if not content:
        raise Exception("File is empty")
    return content
def extract_from_url(url):
    """
    Extract readable text content from a URL.

    Fetches the page (with up to 3 attempts), then collects paragraph
    text, preferring <article>/<main> containers over the whole document.

    Args:
        url (str): URL to extract text from.

    Returns:
        str: Extracted text content.

    Raises:
        Exception: If the page cannot be fetched, parsing fails, or no
            paragraph text is found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Retry transient network failures; re-raise on the final attempt.
    max_retries = 3
    try:
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                break
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise
                time.sleep(1)  # brief pause before retrying
    except requests.RequestException as e:
        raise Exception(f"Failed to fetch URL: {str(e)}")

    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Prefer semantic containers that usually hold the main content.
        # Collect each container's text separately and join with spaces so
        # consecutive articles are not fused together (the old += version
        # concatenated them with no separator).
        parts = []
        for container in soup.find_all(['article', 'main']):
            paragraphs = container.find_all("p")
            text = " ".join(p.text.strip() for p in paragraphs if p.text.strip())
            if text:
                parts.append(text)
        article_text = " ".join(parts)
        # Fall back to every paragraph on the page.
        if not article_text:
            paragraphs = soup.find_all("p")
            article_text = " ".join(p.text.strip() for p in paragraphs if p.text.strip())
    except Exception as e:
        raise Exception(f"URL extraction failed: {str(e)}")

    # Raised outside the try above so the message is not re-wrapped as
    # "URL extraction failed: No text content found on the page".
    if not article_text:
        raise Exception("No text content found on the page")
    return article_text