import pypdf import docx import pandas as pd from pdf2image import convert_from_bytes import pytesseract import uuid def process_file(uploaded_file): """ Input: Streamlit UploadedFile Output: (full_text, filename, method) """ text = "" filename = uploaded_file.name method = "Fast Text" try: # 1. PDF Handling if filename.endswith(".pdf"): pdf_bytes = uploaded_file.getvalue() reader = pypdf.PdfReader(uploaded_file) for i, page in enumerate(reader.pages): extracted = page.extract_text() if extracted: text += f"\n[PAGE {i+1}] {extracted}" # OCR Fallback if len(text.strip()) < 50: method = "OCR (Slow)" images = convert_from_bytes(pdf_bytes) text = "" for i, img in enumerate(images): page_text = pytesseract.image_to_string(img) text += f"\n[PAGE {i+1}] {page_text}" # 2. Word Handling elif filename.endswith(".docx"): doc = docx.Document(uploaded_file) text = "\n".join([para.text for para in doc.paragraphs]) # 3. Excel/CSV Handling (NEW) elif filename.endswith(".csv"): df = pd.read_csv(uploaded_file) text = df.to_string(index=False) method = "Table Parse" elif filename.endswith(".xlsx") or filename.endswith(".xls"): df = pd.read_excel(uploaded_file) text = df.to_string(index=False) method = "Table Parse" # 4. Plain Text elif filename.endswith(".txt"): text = uploaded_file.read().decode("utf-8") except Exception as e: return "", filename, f"Error: {str(e)}" return text, filename, method def chunk_text(text, source, chunk_size=500, overlap=100): """ Generates chunks AND assigns a unique doc_id to link them together. """ words = text.split() chunks = [] doc_id = str(uuid.uuid4()) # Generate ID once per document for i in range(0, len(words), chunk_size - overlap): chunk_text = " ".join(words[i:i + chunk_size]) if len(chunk_text) > 20: # Minimal filter chunks.append({ "text": chunk_text, "source": source, "doc_id": doc_id, "chunk_id": str(uuid.uuid4()) }) return chunks, doc_id