Spaces:

NavyDevilDoc
/

Semantic_Search

Sleeping

App Files Files Community

NavyDevilDoc committed on Dec 17, 2025

Commit

5b8cf94

verified ·

1 Parent(s): 6f1c390

Update src/parsers.py

Browse files

Files changed (1) hide show

src/parsers.py +57 -24

src/parsers.py CHANGED Viewed

@@ -3,45 +3,78 @@ import docx
 import pandas as pd
 from pdf2image import convert_from_bytes
 import pytesseract
-import io
-def parse_file(uploaded_file):
-    filename = uploaded_file.name
     text = ""
     try:
         if filename.endswith(".pdf"):
             reader = pypdf.PdfReader(uploaded_file)
-            def parse_file(uploaded_file):
-                for i, page in enumerate(reader.pages):
-                    extracted = page.extract_text()
-                    if extracted:
-                        text += f"\n[PAGE {i+1}] {extracted}"
-                if len(text.strip()) < 50:
-                    method = "OCR (Slow)"
-                    images = convert_from_bytes(pdf_bytes)
-                    text = ""
-                    for i, img in enumerate(images):
-                        page_text = pytesseract.image_to_string(img)
-                        text += f"\n[PAGE {i+1}] {page_text}"
         elif filename.endswith(".docx"):
             doc = docx.Document(uploaded_file)
-            text = "\n".join([p.text for p in doc.paragraphs])
         elif filename.endswith(".csv"):
-            # NEW: CSV Handling
             df = pd.read_csv(uploaded_file)
-            text = df.to_string(index=False) # Flattens table to string
         elif filename.endswith(".xlsx") or filename.endswith(".xls"):
-            # NEW: Excel Handling
             df = pd.read_excel(uploaded_file)
             text = df.to_string(index=False)
-    except Exception as e:
-        return None, f"Error: {e}"
-    return text, "Success"

 import pandas as pd
 from pdf2image import convert_from_bytes
 import pytesseract
+import uuid
def process_file(uploaded_file):
    """
    Extract plain text from an uploaded document.

    Input: Streamlit UploadedFile (the code only relies on `.name`, `.read()`
           and, for the PDF OCR fallback, `.getvalue()`).
    Output: (full_text, filename, method)
        full_text -- the extracted text; "" when extraction fails.
        filename  -- `uploaded_file.name`, returned unchanged.
        method    -- "Fast Text", "OCR (Slow)" or "Table Parse" on success,
                     or a string starting with "Error: " on failure.

    Parsing failures are reported through `method` rather than raised.
    The file type is chosen from the filename extension, case-insensitively.
    """
    text = ""
    filename = uploaded_file.name
    method = "Fast Text"
    # Match on a lower-cased copy so "REPORT.PDF" is handled like "report.pdf";
    # the filename handed back to the caller stays untouched.
    lowered = filename.lower()
    try:
        # 1. PDF Handling
        if lowered.endswith(".pdf"):
            reader = pypdf.PdfReader(uploaded_file)
            page_parts = []
            for number, page in enumerate(reader.pages, start=1):
                extracted = page.extract_text()
                if extracted:
                    page_parts.append(f"\n[PAGE {number}] {extracted}")
            text = "".join(page_parts)
            # OCR Fallback: the text layer yielded fewer than 50 characters,
            # so render the pages to images and run Tesseract on each one.
            if len(text.strip()) < 50:
                method = "OCR (Slow)"
                # The raw bytes are only needed here, so copy them lazily.
                images = convert_from_bytes(uploaded_file.getvalue())
                text = "".join(
                    f"\n[PAGE {number}] {pytesseract.image_to_string(image)}"
                    for number, image in enumerate(images, start=1)
                )
        # 2. Word Handling
        elif lowered.endswith(".docx"):
            doc = docx.Document(uploaded_file)
            text = "\n".join(para.text for para in doc.paragraphs)
        # 3. Excel/CSV Handling: flatten the table to a string
        elif lowered.endswith(".csv"):
            text = pd.read_csv(uploaded_file).to_string(index=False)
            method = "Table Parse"
        elif lowered.endswith((".xlsx", ".xls")):
            text = pd.read_excel(uploaded_file).to_string(index=False)
            method = "Table Parse"
        # 4. Plain Text
        elif lowered.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")
        else:
            # Previously this fell through and reported a successful
            # "Fast Text" parse of an empty document.
            return "", filename, f"Error: Unsupported file type: {filename}"
    except Exception as e:
        # Boundary handler: any parser failure becomes an error tuple so the
        # caller can report it without crashing.
        return "", filename, f"Error: {str(e)}"
    return text, filename, method
def chunk_text(text, source, chunk_size=500, overlap=100):
    """
    Generates chunks AND assigns a unique doc_id to link them together.

    Input:
        text       -- full document text; it is split on whitespace into words.
        source     -- label stored on every chunk under the "source" key.
        chunk_size -- maximum number of words per chunk (must be > 0).
        overlap    -- number of words shared by consecutive chunks
                      (must satisfy 0 <= overlap < chunk_size).
    Output: (chunks, doc_id)
        chunks -- list of dicts with keys "text", "source", "doc_id", "chunk_id".
        doc_id -- UUID4 string shared by every chunk produced by this call.
    Raises:
        ValueError -- if chunk_size/overlap would not advance the window.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size used to give range() a zero step; a larger
        # overlap silently produced no chunks at all.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    doc_id = str(uuid.uuid4())  # Generate ID once per document
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        piece = " ".join(words[start:end])
        if len(piece) > 20:  # Minimal filter: drop near-empty fragments
            chunks.append({
                "text": piece,
                "source": source,
                "doc_id": doc_id,
                "chunk_id": str(uuid.uuid4()),
            })
        if end >= len(words):
            # This window already reached the last word; any later window
            # would be a pure suffix of it and only add a duplicate chunk.
            break
    return chunks, doc_id