Spaces:

sgAtdbd
/

Hateshield-bn

Sleeping

App Files Files Community

Hateshield-bn / services /text_extractor.py

sgAtdbd

Initial deployment of HateShield backend

8ad9255 about 1 month ago

raw

history blame

2.61 kB

	import requests
	from bs4 import BeautifulSoup
	from typing import Optional
	import PyPDF2
	from docx import Document
	import io

	def extract_from_url(url: str) -> str:
	    """Download a web page and return its text as a single line.

	    The page is requested synchronously with a browser-like User-Agent and
	    a 10 second timeout. ``script``, ``style``, ``nav``, ``footer`` and
	    ``header`` elements are removed before the text is collected, and the
	    remaining text is re-joined with single spaces.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The page text with runs of spaces and line breaks collapsed.

	    Raises:
	        Exception: On any failure (request error, HTTP error status,
	            parsing problem); the message embeds the original error text.
	    """
	    try:
	        page = requests.get(
	            url,
	            headers={
	                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
	            },
	            timeout=10,
	        )
	        # Turn 4xx/5xx responses into exceptions handled below.
	        page.raise_for_status()

	        document = BeautifulSoup(page.content, 'html.parser')

	        # Drop elements whose text is not part of the page's main content.
	        for unwanted in document(["script", "style", "nav", "footer", "header"]):
	            unwanted.decompose()

	        raw_text = document.get_text(separator=' ', strip=True)

	        # Break the text into lines, then into space-separated pieces, and
	        # keep only the non-empty pieces.
	        pieces = []
	        for raw_line in raw_text.splitlines():
	            for fragment in raw_line.strip().split(" "):
	                fragment = fragment.strip()
	                if fragment:
	                    pieces.append(fragment)

	        return ' '.join(pieces)
	    except Exception as e:
	        print(f"Error extracting from URL: {e}")
	        raise Exception(f"Failed to extract text from URL: {str(e)}")

	def extract_from_document(content: bytes, file_extension: str) -> str:
	    """Extract text from an uploaded document (synchronous).

	    Dispatches on the file extension: ``.pdf`` goes to ``_extract_from_pdf``,
	    ``.docx`` goes to ``_extract_from_docx`` and ``.txt`` is decoded as
	    UTF-8. The extension is matched case-insensitively and surrounding
	    whitespace is ignored, so ``".PDF"`` and ``".Txt"`` are accepted.

	    Args:
	        content: Raw bytes of the document.
	        file_extension: Extension including the leading dot, e.g. ``".pdf"``.

	    Returns:
	        The extracted text.

	    Raises:
	        Exception: If the extension is unsupported or extraction fails; the
	            message embeds the underlying error text and the original
	            exception is attached as the cause.
	    """
	    try:
	        # Normalise so that ".PDF" or " .txt" are treated like ".pdf"/".txt".
	        normalized = (file_extension or "").strip().lower()

	        if normalized == ".pdf":
	            return _extract_from_pdf(content)
	        if normalized == ".docx":
	            return _extract_from_docx(content)
	        if normalized == ".txt":
	            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
	            # which would otherwise end up as "\ufeff" in the text.
	            return content.decode('utf-8-sig')

	        # Report the extension exactly as the caller supplied it.
	        raise ValueError(f"Unsupported file type: {file_extension}")
	    except Exception as e:
	        print(f"Error extracting from document: {e}")
	        raise Exception(f"Failed to extract text from document: {str(e)}") from e

	def _extract_from_pdf(content: bytes) -> str:
	    """Return the text of every page of a PDF, one page per line group.

	    Args:
	        content: Raw bytes of the PDF file.

	    Returns:
	        The text of all pages joined with newlines, with leading and
	        trailing whitespace removed.

	    Raises:
	        Exception: If the PDF cannot be read; the message embeds the
	            underlying error text.
	    """
	    try:
	        # PdfReader needs a file-like object, so wrap the bytes in memory.
	        reader = PyPDF2.PdfReader(io.BytesIO(content))
	        page_texts = [page.extract_text() for page in reader.pages]
	        return "\n".join(page_texts).strip()
	    except Exception as e:
	        raise Exception(f"Error reading PDF: {str(e)}")

	def _extract_from_docx(content: bytes) -> str:
	    """Return the text of every paragraph of a DOCX file.

	    Args:
	        content: Raw bytes of the DOCX file.

	    Returns:
	        The paragraph texts joined with newlines, with leading and
	        trailing whitespace removed.

	    Raises:
	        Exception: If the DOCX cannot be read; the message embeds the
	            underlying error text.
	    """
	    try:
	        # Document needs a file-like object, so wrap the bytes in memory.
	        parsed = Document(io.BytesIO(content))
	        paragraph_texts = [para.text for para in parsed.paragraphs]
	        return "\n".join(paragraph_texts).strip()
	    except Exception as e:
	        raise Exception(f"Error reading DOCX: {str(e)}")