File size: 1,300 Bytes
a561338 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import logging
import os
import tempfile

from pdfminer.high_level import extract_text as extract_pdf_text
from docx import Document as DocxDocument

from app.config import ALLOWED_EXTENSIONS
def allowed_file(filename: str) -> bool:
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared in lower case.
    A name without any dot is never allowed.
    """
    if "." not in filename:
        return False
    _, extension = filename.rsplit(".", 1)
    return extension.lower() in ALLOWED_EXTENSIONS
def _extract_via_temp_file(content_bytes: bytes, ext: str) -> str:
    """Parse PDF/DOCX bytes through a temporary file that is always removed.

    The bytes are written to a named temporary file, the file is closed, and
    its path is handed to the parser matching *ext* ("pdf" or "docx").
    """
    tmp = tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False)
    try:
        # Close the handle before parsing: the parser reopens the file by
        # name, which fails on Windows while it is still open for writing.
        with tmp:
            tmp.write(content_bytes)
        if ext == "pdf":
            return extract_pdf_text(tmp.name)
        doc = DocxDocument(tmp.name)
        return "\n".join(p.text for p in doc.paragraphs)
    finally:
        # delete=False means nobody else cleans this up; do it ourselves,
        # but never let a failed cleanup discard text we already extracted.
        try:
            os.unlink(tmp.name)
        except OSError:
            logging.getLogger(__name__).warning(
                "Could not remove temporary file %s", tmp.name, exc_info=True
            )


def extract_text_from_file(content_bytes: bytes, filename: str, max_words: int = 500) -> str:
    """Extract plain text from an uploaded txt, pdf or docx file.

    The file type is taken from the text after the last dot in *filename*
    (case-insensitive). Extraction is best-effort: an unsupported or missing
    extension, or any failure while parsing, yields an empty string.

    Args:
        content_bytes: Raw bytes of the uploaded file.
        filename: Original file name; only its extension is used.
        max_words: Maximum number of whitespace-separated words allowed.

    Returns:
        The extracted text, or "" when nothing could be extracted.

    Raises:
        ValueError: If the extracted text contains more than *max_words* words.
    """
    # rpartition never raises: a name without a dot gives an empty separator,
    # which we treat as "no extension" instead of failing with IndexError.
    _, dot, suffix = filename.rpartition(".")
    ext = suffix.lower() if dot else ""

    text = ""
    try:
        if ext == "txt":
            # Invalid UTF-8 sequences are dropped rather than rejected.
            text = content_bytes.decode("utf-8", errors="ignore")
        elif ext in ("pdf", "docx"):
            text = _extract_via_temp_file(content_bytes, ext)
    except Exception:
        # Deliberate best-effort: a corrupt upload must not crash the caller,
        # but record the failure so it is not completely invisible.
        logging.getLogger(__name__).warning(
            "Failed to extract text from %r", filename, exc_info=True
        )
        text = ""

    # Word count check
    word_count = len(text.split())
    if word_count > max_words:
        raise ValueError(f"File exceeds {max_words} words (found {word_count}).")
    return text
|