Spaces:
Runtime error
Runtime error
| import fitz | |
| from bs4 import BeautifulSoup | |
| import aiohttp | |
| from pathlib import Path | |
| from typing import Optional | |
class DocumentProcessor:
    """Stateless helpers that turn PDFs and web pages into plain text.

    None of the methods reads instance or class state, so each one is a
    ``@staticmethod`` and can be called on the class or on an instance.
    """

    @staticmethod
    async def extract_pdf_text(file_path: Path) -> str:
        """Return the text of every page of the PDF at *file_path*.

        Page texts are kept in document order and joined with a blank line
        (``"\\n\\n"``).

        Note: the body contains no ``await``; the document is opened and
        read synchronously inside this coroutine.
        """
        doc = fitz.open(file_path)
        try:
            # join() consumes the generator before ``finally`` runs, so every
            # page is read while the document is still open.
            return "\n\n".join(page.get_text() for page in doc)
        finally:
            # Release the document handle even when a page fails to extract.
            doc.close()

    @staticmethod
    async def scrape_url(url: str) -> tuple[str, str]:
        """Download *url* and return ``(title, text)`` extracted from its HTML.

        ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements
        are removed before the text is extracted. The returned text has one
        stripped, non-empty line per line. The title is the stripped content
        of the ``<title>`` element, or ``"Web Document"`` when that element
        is missing or blank.
        """
        async with aiohttp.ClientSession() as session:
            # str() tolerates callers that pass a URL object instead of a
            # plain string (presumably a validated URL type; confirm with
            # callers).
            async with session.get(str(url)) as response:
                html = await response.text()

        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-content elements so their text does not leak into the
        # output.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        title = soup.find('title')
        title_text = title.get_text().strip() if title is not None else ""
        if not title_text:
            # Same fallback for a missing and for an empty <title>.
            title_text = "Web Document"

        text = soup.get_text(separator='\n', strip=True)
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        cleaned_text = '\n'.join(lines)
        return title_text, cleaned_text

    @staticmethod
    def validate_file_type(
        filename: str, allowed_extensions: Optional[set] = None
    ) -> bool:
        """Return True if *filename*'s extension is in *allowed_extensions*.

        The filename's suffix is lower-cased before the lookup, so entries
        in *allowed_extensions* should be lower-case and include the leading
        dot (for example ``".pdf"``). When *allowed_extensions* is ``None``
        only ``".pdf"`` is accepted.
        """
        if allowed_extensions is None:
            # Built per call instead of sharing one mutable default object.
            allowed_extensions = {'.pdf'}
        return Path(filename).suffix.lower() in allowed_extensions

    @staticmethod
    def clean_text(text: str) -> str:
        """Strip every line of *text* and drop the lines that end up empty.

        Lines are split on ``"\\n"`` and the survivors are re-joined with
        ``"\\n"``.
        """
        stripped_lines = (line.strip() for line in text.split('\n'))
        return '\n'.join(line for line in stripped_lines if line)