Hateshield-bn / services /text_extractor.py
sgAtdbd's picture
Initial deployment of HateShield backend
8ad9255
raw
history blame
2.61 kB
import requests
from bs4 import BeautifulSoup
from typing import Optional
import PyPDF2
from docx import Document
import io
def extract_from_url(url: str) -> str:
    """Fetch a web page and return its visible text as one normalized string.

    The page is downloaded with a browser-like ``User-Agent`` header and a
    10 second timeout, parsed with BeautifulSoup's ``html.parser``, and
    stripped of ``script``, ``style``, ``nav``, ``footer`` and ``header``
    elements before the remaining text is collected.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with every run of whitespace (spaces, tabs, newlines)
        collapsed to a single space.

    Raises:
        Exception: If the request fails, the server answers with an error
            status, or parsing fails. The underlying error is attached as
            ``__cause__`` so the original traceback is preserved.
    """
    # NOTE(review): if `url` is supplied by end users, this fetch could be
    # pointed at internal hosts (SSRF) -- confirm it is validated upstream.
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Raw bytes are passed so the parser can detect the page encoding.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that carry no article text (code, styling, page chrome).
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        text = soup.get_text(separator=' ', strip=True)

        # str.split() with no argument splits on any whitespace run, so tabs
        # and embedded newlines are normalized along with plain spaces.
        return ' '.join(text.split())
    except Exception as e:
        print(f"Error extracting from URL: {e}")
        # Chain the cause so callers and logs still see the real failure.
        raise Exception(f"Failed to extract text from URL: {str(e)}") from e
def extract_from_document(content: bytes, file_extension: str) -> str:
    """Extract text from an uploaded document based on its file extension.

    Supported extensions are ``.pdf``, ``.docx`` and ``.txt``. Matching is
    case-insensitive and tolerates a missing leading dot, so ``".PDF"`` and
    ``"txt"`` are accepted as well.

    Args:
        content: Raw bytes of the document.
        file_extension: Extension identifying the document type.

    Returns:
        The extracted text. For ``.txt`` the bytes are decoded as UTF-8 and a
        leading byte-order mark, if present, is removed.

    Raises:
        Exception: If the extension is unsupported or extraction fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        # Normalize so ".PDF", " .pdf " and "pdf" all select the same branch;
        # `or ""` lets a None extension fall through to the "unsupported" error.
        normalized = (file_extension or "").strip().lower()
        if normalized and not normalized.startswith("."):
            normalized = "." + normalized

        if normalized == ".pdf":
            return _extract_from_pdf(content)
        if normalized == ".docx":
            return _extract_from_docx(content)
        if normalized == ".txt":
            # utf-8-sig decodes plain UTF-8 too, but strips a leading BOM that
            # would otherwise end up as U+FEFF at the start of the text.
            return content.decode('utf-8-sig')
        raise ValueError(f"Unsupported file type: {file_extension}")
    except Exception as e:
        print(f"Error extracting from document: {e}")
        # Chain the cause so callers and logs still see the real failure.
        raise Exception(f"Failed to extract text from document: {str(e)}") from e
def _extract_from_pdf(content: bytes) -> str:
"""Extract text from PDF"""
try:
pdf_file = io.BytesIO(content)
pdf_reader = PyPDF2.PdfReader(pdf_file)
text = ""
for page in pdf_reader.pages:
text += page.extract_text() + "\n"
return text.strip()
except Exception as e:
raise Exception(f"Error reading PDF: {str(e)}")
def _extract_from_docx(content: bytes) -> str:
"""Extract text from DOCX"""
try:
doc_file = io.BytesIO(content)
doc = Document(doc_file)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text.strip()
except Exception as e:
raise Exception(f"Error reading DOCX: {str(e)}")