Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from typing import Optional | |
| import PyPDF2 | |
| from docx import Document | |
| import io | |
def extract_from_url(url: str) -> str:
    """Extract the visible text content of a web page (synchronous).

    Downloads ``url`` with a browser-like ``User-Agent`` header and a
    10-second timeout, parses the response body as HTML, removes elements
    that do not carry page content, and returns the remaining text with
    whitespace normalised.

    Args:
        url: Address of the page to download.

    Returns:
        The page text as one string in which every run of whitespace
        (spaces, tabs, line breaks) is collapsed to a single space.

    Raises:
        Exception: If the request fails, the server answers with an HTTP
            error status, or parsing fails. The underlying error is
            attached as ``__cause__``.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx responses into exceptions instead of parsing error pages.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop scripts, styles and page chrome so only body content remains.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        text = soup.get_text(separator=' ', strip=True)

        # str.split() with no argument splits on any whitespace run, so tabs
        # and line breaks embedded in text nodes are normalised as well.
        return ' '.join(text.split())
    except Exception as e:
        print(f"Error extracting from URL: {e}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Failed to extract text from URL: {str(e)}") from e
def extract_from_document(content: bytes, file_extension: str) -> str:
    """Extract text from an uploaded document (synchronous).

    Dispatches on ``file_extension`` to the matching extractor. The
    extension is compared case-insensitively, so ``".PDF"`` and ``".pdf"``
    are treated the same.

    Args:
        content: Raw bytes of the document.
        file_extension: Extension including the leading dot; one of
            ``".pdf"``, ``".docx"`` or ``".txt"``.

    Returns:
        The extracted text. For ``.txt`` input the bytes are decoded as
        UTF-8 and a leading byte-order mark, if present, is removed.

    Raises:
        Exception: If the extension is unsupported or extraction fails.
            The underlying error is attached as ``__cause__``.
    """
    try:
        extension = file_extension.lower()
        if extension == ".pdf":
            return _extract_from_pdf(content)
        elif extension == ".docx":
            return _extract_from_docx(content)
        elif extension == ".txt":
            # utf-8-sig decodes plain UTF-8 identically but also strips a
            # leading BOM, which would otherwise appear as U+FEFF in the text.
            return content.decode('utf-8-sig')
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")
    except Exception as e:
        print(f"Error extracting from document: {e}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Failed to extract text from document: {str(e)}") from e
def _extract_from_pdf(content: bytes) -> str:
    """Extract text from a PDF held in memory.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The text of all pages, one page per line group, separated by
        newlines, with leading and trailing whitespace removed.

    Raises:
        Exception: If the PDF cannot be read or a page cannot be
            processed. The underlying error is attached as ``__cause__``.
    """
    try:
        pdf_file = io.BytesIO(content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` guards against a page yielding None instead of a string,
        # which would otherwise fail on concatenation.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "\n".join(page_texts).strip()
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error reading PDF: {str(e)}") from e
def _extract_from_docx(content: bytes) -> str:
    """Extract text from a DOCX document held in memory.

    Only the paragraphs exposed by ``doc.paragraphs`` are read.

    Args:
        content: Raw bytes of the DOCX file.

    Returns:
        The paragraph texts joined by newlines, with leading and trailing
        whitespace removed.

    Raises:
        Exception: If the document cannot be opened or read. The
            underlying error is attached as ``__cause__``.
    """
    try:
        doc_file = io.BytesIO(content)
        doc = Document(doc_file)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error reading DOCX: {str(e)}") from e