"""Helpers that pull plain text out of web pages and uploaded documents."""

import io
from typing import Optional

import PyPDF2
import requests
from bs4 import BeautifulSoup
from docx import Document


def extract_from_url(url: str) -> str:
    """Fetch *url* and return its visible text as one whitespace-normalised string.

    The page is downloaded with a browser-like User-Agent and a 10 second
    timeout, parsed with BeautifulSoup's ``html.parser``, and stripped of
    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements before
    the text is collected.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with space/newline runs collapsed to single spaces.

    Raises:
        Exception: If the request fails, the server answers with an error
            status, or parsing fails. The original error is attached as
            ``__cause__``.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that carry code or page chrome rather than content.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        text = soup.get_text(separator=' ', strip=True)

        # Split into lines, then into space-separated pieces; empty pieces are
        # discarded so consecutive spaces and line breaks become one space.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        return ' '.join(chunk for chunk in chunks if chunk)
    except Exception as e:
        print(f"Error extracting from URL: {e}")
        # Chain the cause so the underlying traceback is not lost.
        raise Exception(f"Failed to extract text from URL: {e}") from e


def extract_from_document(content: bytes, file_extension: str) -> str:
    """Return the text contained in an in-memory document.

    Args:
        content: Raw bytes of the file.
        file_extension: Extension including the leading dot. ``.pdf``,
            ``.docx`` and ``.txt`` are supported; matching is
            case-insensitive.

    Returns:
        The extracted text. ``.txt`` content is decoded as UTF-8 and a leading
        byte-order mark, if present, is removed.

    Raises:
        Exception: If the extension is unsupported or extraction/decoding
            fails. The original error is attached as ``__cause__``.
    """
    try:
        # Normalise case so ".PDF" and ".pdf" are treated alike; non-string
        # values fall through unchanged to the unsupported-type error below.
        if isinstance(file_extension, str):
            extension = file_extension.lower()
        else:
            extension = file_extension

        if extension == ".pdf":
            return _extract_from_pdf(content)
        if extension == ".docx":
            return _extract_from_docx(content)
        if extension == ".txt":
            # "utf-8-sig" decodes plain UTF-8 identically but also strips a
            # BOM, which would otherwise show up as "\ufeff" in the text.
            return content.decode('utf-8-sig')
        raise ValueError(f"Unsupported file type: {file_extension}")
    except Exception as e:
        print(f"Error extracting from document: {e}")
        raise Exception(f"Failed to extract text from document: {e}") from e


def _extract_from_pdf(content: bytes) -> str:
    """Return the text of every page of a PDF, one page per line group.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        Page texts joined with newlines, with surrounding whitespace removed.

    Raises:
        Exception: If the PDF cannot be read; the original error is attached
            as ``__cause__``.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
        # `or ""` guards against a page yielding None instead of a string
        # (NOTE(review): depends on the installed PyPDF2 version; harmless
        # when a str is always returned).
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "\n".join(page_texts).strip()
    except Exception as e:
        raise Exception(f"Error reading PDF: {e}") from e


def _extract_from_docx(content: bytes) -> str:
    """Return the text of every paragraph of a DOCX file, one per line.

    Only ``doc.paragraphs`` is read; anything the library does not expose
    through that attribute is not included.

    Args:
        content: Raw bytes of the DOCX file.

    Returns:
        Paragraph texts joined with newlines, with surrounding whitespace
        removed.

    Raises:
        Exception: If the document cannot be read; the original error is
            attached as ``__cause__``.
    """
    try:
        doc = Document(io.BytesIO(content))
        return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
    except Exception as e:
        raise Exception(f"Error reading DOCX: {e}") from e