Spaces:
Sleeping
Sleeping
File size: 2,606 Bytes
8ad9255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import requests
from bs4 import BeautifulSoup
from typing import Optional
import PyPDF2
from docx import Document
import io
def extract_from_url(url: str) -> str:
    """Extract the visible text content of a web page (synchronous).

    Fetches ``url`` with a browser-like User-Agent and a 10 second timeout,
    parses the response with BeautifulSoup's ``html.parser``, removes
    non-content elements and returns the remaining text separated by
    single spaces.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with empty fragments removed.

    Raises:
        Exception: If the request, the HTTP status check or the parsing
            fails. The underlying error is attached as ``__cause__``.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx responses into exceptions instead of parsing error pages.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that carry no article text: scripts, styles and
        # page chrome (nav, footer, header).
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        text = soup.get_text(separator=' ', strip=True)

        # Clean up whitespace: trim every line, split it on spaces, and keep
        # only the non-empty pieces so runs of spaces collapse to one.
        stripped_lines = (line.strip() for line in text.splitlines())
        chunks = (
            phrase.strip()
            for line in stripped_lines
            for phrase in line.split(" ")
        )
        return ' '.join(chunk for chunk in chunks if chunk)
    except Exception as e:
        print(f"Error extracting from URL: {e}")
        # Chain explicitly so the original traceback is kept as the cause.
        raise Exception(f"Failed to extract text from URL: {str(e)}") from e
def extract_from_document(content: bytes, file_extension: str) -> str:
    """Extract text from a document given its raw bytes (synchronous).

    Args:
        content: Raw bytes of the document.
        file_extension: Extension including the leading dot (".pdf",
            ".docx" or ".txt"). Matched case-insensitively.

    Returns:
        The extracted text.

    Raises:
        Exception: If the extension is unsupported or extraction fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        # Uploaded files often carry upper-case extensions (".PDF", ".TXT").
        extension = file_extension.lower()
        if extension == ".pdf":
            return _extract_from_pdf(content)
        if extension == ".docx":
            return _extract_from_docx(content)
        if extension == ".txt":
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise end up as "\ufeff" in the text.
            return content.decode('utf-8-sig')
        raise ValueError(f"Unsupported file type: {file_extension}")
    except Exception as e:
        print(f"Error extracting from document: {e}")
        # Chain explicitly so the original traceback is kept as the cause.
        raise Exception(f"Failed to extract text from document: {str(e)}") from e
def _extract_from_pdf(content: bytes) -> str:
    """Extract text from a PDF given its raw bytes.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The text of all pages joined by newlines, with leading and trailing
        whitespace stripped.

    Raises:
        Exception: If the PDF cannot be read. The underlying error is
            attached as ``__cause__``.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
        # Join once instead of growing a string page by page; after strip()
        # the result equals appending "\n" after every page.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        return "\n".join(page_texts).strip()
    except Exception as e:
        # Chain explicitly so the original traceback is kept as the cause.
        raise Exception(f"Error reading PDF: {str(e)}") from e
def _extract_from_docx(content: bytes) -> str:
    """Extract text from a DOCX file given its raw bytes.

    Args:
        content: Raw bytes of the DOCX file.

    Returns:
        The text of all paragraphs in ``doc.paragraphs`` joined by newlines,
        with leading and trailing whitespace stripped.

    Raises:
        Exception: If the document cannot be read. The underlying error is
            attached as ``__cause__``.
    """
    try:
        doc = Document(io.BytesIO(content))
        # Join once instead of growing a string paragraph by paragraph; after
        # strip() the result equals appending "\n" after every paragraph.
        return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
    except Exception as e:
        # Chain explicitly so the original traceback is kept as the cause.
        raise Exception(f"Error reading DOCX: {str(e)}") from e