import os

import docx
import nltk
import pdfplumber
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
def extract_text_from_docx(path):
    """Return the non-empty paragraphs of a .docx file as a list of strings."""
    try:
        doc = docx.Document(path)
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception as e:
        print(f"Failed to extract DOCX: {e}")
        return []
def extract_text_from_pdf(path):
    """Extract text from a PDF and split it into paragraph-sized chunks."""
    try:
        with pdfplumber.open(path) as pdf:
            all_text = "\n".join(
                page.extract_text() for page in pdf.pages if page.extract_text()
            )
    except Exception as e:
        print(f"Failed to extract PDF: {e}")
        return []

    if not all_text.strip():
        return []

    # Prefer splitting on blank lines, which usually mark paragraph breaks.
    paragraphs = [p.strip() for p in all_text.split("\n\n") if p.strip()]
    if paragraphs:
        return paragraphs

    # Fallback: group the text into chunks of up to 5 sentences.
    sentences = sent_tokenize(all_text)
    return [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]
def extract_paragraphs(path):
    """Dispatch to the right extractor based on the file extension."""
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".docx":
        return extract_text_from_docx(path)
    elif ext == ".pdf":
        return extract_text_from_pdf(path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")