File size: 1,232 Bytes
7de37f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import fitz  # PyMuPDF for PDFs
import docx
import requests
from bs4 import BeautifulSoup
from tempfile import NamedTemporaryFile

def extract_text_from_url(file_url: str, timeout: float = 30.0) -> str:
    """Download a document from *file_url* and return its plain text.

    The format is chosen from the file extension of the URL path (query
    string and fragment are ignored). Supported extensions are ``pdf``
    (read with PyMuPDF), ``docx`` (read with python-docx) and ``eml``
    (the raw file is fed to BeautifulSoup's HTML parser and its text is
    returned).

    Args:
        file_url: URL of the document to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extracted text, or the string ``"❌ Unsupported file format"``
        when the extension is not one of the supported formats. In that
        case nothing is downloaded.

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an HTTP error status.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    import os
    from urllib.parse import urlparse

    # Take the extension from the last path segment only, so URLs such as
    # "https://example.com/download" or ".../v1.2/report" do not yield a
    # bogus extension containing "/" (which would break temp-file creation).
    file_name = urlparse(file_url).path.rsplit("/", 1)[-1]
    ext = file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""

    if ext not in ("pdf", "docx", "eml"):
        return "❌ Unsupported file format"

    response = requests.get(file_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a document.
    response.raise_for_status()

    # delete=False because the parsers reopen the file by name; the handle
    # is closed before parsing (required on Windows) and the file is removed
    # in the ``finally`` below.
    with NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
        tmp.write(response.content)
        temp_path = tmp.name

    try:
        if ext == "pdf":
            with fitz.open(temp_path) as pdf:
                return "\n".join(page.get_text() for page in pdf)

        if ext == "docx":
            document = docx.Document(temp_path)
            return "\n".join(p.text for p in document.paragraphs)

        # ext == "eml"
        # NOTE(review): the message is not MIME-parsed; headers and any
        # encoded parts go through the HTML parser as-is.
        with open(temp_path, "r", encoding="utf-8", errors="ignore") as email_file:
            soup = BeautifulSoup(email_file.read(), "html.parser")
        return soup.get_text(separator="\n")
    finally:
        os.unlink(temp_path)

def chunk_text(text: str, chunk_size: int = 200, overlap: int = 50) -> list:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. The final
    chunk may be shorter than ``chunk_size``.

    Args:
        text: Text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings with words joined by single spaces; empty
        when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero makes range() raise an opaque error, a negative step
    # silently yields no chunks, and a negative overlap skips words.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]