Spaces:
Running
Running
File size: 1,232 Bytes
7de37f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import os
from tempfile import NamedTemporaryFile

import docx
import fitz  # PyMuPDF for PDFs
import requests
from bs4 import BeautifulSoup
def extract_text_from_url(file_url: str) -> str:
    """Download the file at *file_url* and return its plain-text content.

    The format is chosen from the extension of the last path segment of
    the URL (query string and fragment are ignored). Supported extensions
    are ``pdf``, ``docx`` and ``eml``.

    Args:
        file_url: HTTP(S) URL of the document to download.

    Returns:
        The extracted text, or the literal string
        ``"❌ Unsupported file format"`` when the extension is not one of
        the supported ones (no download is attempted in that case).

    Raises:
        requests.RequestException: if the download fails, times out, or
            the server answers with an HTTP error status.
    """
    # Look only at the final path segment so that dots in the host name or
    # in a directory (and URLs with no extension at all) cannot produce a
    # bogus "extension" containing "/" that would break the temp-file name.
    path = file_url.split("?", 1)[0].split("#", 1)[0]
    filename = path.rsplit("/", 1)[-1]
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    if ext not in ("pdf", "docx", "eml"):
        return "❌ Unsupported file format"

    # A timeout keeps a stalled server from hanging the caller forever, and
    # raise_for_status stops an HTML error page being parsed as a document.
    response = requests.get(file_url, timeout=30)
    response.raise_for_status()

    tmp = NamedTemporaryFile(delete=False, suffix=f".{ext}")
    try:
        # Close the handle before any parser reopens the file by name;
        # reopening a still-open temporary file fails on Windows.
        with tmp:
            tmp.write(response.content)

        if ext == "pdf":
            with fitz.open(tmp.name) as pdf:
                return "\n".join(page.get_text() for page in pdf)

        if ext == "docx":
            document = docx.Document(tmp.name)
            return "\n".join(p.text for p in document.paragraphs)

        # ext == "eml"
        # NOTE(review): the raw message (headers included) is fed to the
        # HTML parser as-is; MIME parts are not decoded here.
        with open(tmp.name, "r", encoding="utf-8", errors="ignore") as email_file:
            soup = BeautifulSoup(email_file.read(), "html.parser")
        return soup.get_text(separator="\n")
    finally:
        # delete=False means nobody else cleans this up.
        os.remove(tmp.name)
def chunk_text(text: str, chunk_size: int = 200, overlap: int = 50) -> list:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words, so each chunk starts
    ``chunk_size - overlap`` words after the previous one. Chunking stops
    as soon as a chunk reaches the end of the text, so no trailing chunk
    consisting only of already-emitted overlap words is produced.

    Args:
        text: Input text; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared between consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of strings, each the words of one chunk joined by single
        spaces. Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: if ``chunk_size`` is not positive or ``overlap`` is
            outside ``0 <= overlap < chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would give a zero or negative step (an opaque
    # range() error or a silently empty result); a negative overlap would
    # skip words between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail; anything further would
            # only repeat words from its overlap region.
            break
    return chunks
|