hackrx / processor.py
asdfghcwrej6u's picture
Upload 6 files
7de37f7 verified
import os
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse

import docx
import fitz  # PyMuPDF for PDFs
import requests
from bs4 import BeautifulSoup
def extract_text_from_url(file_url: str, timeout: float = 30.0) -> str:
    """Download the document at *file_url* and return its plain text.

    The format is chosen from the file extension in the URL path:
    ``pdf`` is read with PyMuPDF, ``docx`` with python-docx, and ``eml``
    is read as text and passed through BeautifulSoup's ``html.parser``.

    Args:
        file_url: URL of the document. Query string and fragment are
            ignored when determining the extension.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted text, or the literal string
        ``"❌ Unsupported file format"`` when the extension is not one of
        ``pdf``, ``docx`` or ``eml`` (nothing is downloaded in that case).

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an HTTP error status.
    """
    # Use only the URL *path* for the extension so that query strings and
    # fragments (e.g. signed-URL parameters) are never mistaken for it.
    ext = os.path.splitext(urlparse(file_url).path)[1].lstrip(".").lower()
    if ext not in ("pdf", "docx", "eml"):
        # Reject before any network or disk I/O happens.
        return "❌ Unsupported file format"

    response = requests.get(file_url, timeout=timeout)
    # Fail loudly instead of parsing an HTML error page as a document.
    response.raise_for_status()

    # Close the temp file before parsing: reopening a still-open
    # NamedTemporaryFile by name is not portable (it fails on Windows).
    with NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
        tmp.write(response.content)
        tmp_path = tmp.name

    try:
        if ext == "pdf":
            with fitz.open(tmp_path) as pdf:
                return "\n".join(page.get_text() for page in pdf)
        if ext == "docx":
            document = docx.Document(tmp_path)
            return "\n".join(p.text for p in document.paragraphs)
        # Remaining case is "eml": strip any markup and keep the text.
        with open(tmp_path, "r", encoding="utf-8", errors="ignore") as email_file:
            raw = email_file.read()
        return BeautifulSoup(raw, "html.parser").get_text(separator="\n")
    finally:
        # delete=False means the file is ours to clean up.
        os.remove(tmp_path)
def chunk_text(text: str, chunk_size: int = 200, overlap: int = 50) -> list:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing
    chunk is emitted that is entirely contained in the one before it.

    Args:
        text: Input text; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings with words joined by single spaces; an
        empty list when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # A zero or negative step would either crash range() or silently
        # drop all text; a negative overlap would skip words.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already covers the final word; any further
            # window would only repeat words from it.
            break
    return chunks