Spaces:

NavyDevilDoc
/

Semantic_Search

Sleeping

File size: 2,562 Bytes

0d13a08
 
 
 
 
5b8cf94
0d13a08
5b8cf94
 
 
 
 
0d13a08
5b8cf94
 
0d13a08
 
5b8cf94
0d13a08
5b8cf94
0d13a08
 
5b8cf94
 
 
 
 
 
 
 
 
 
 
 
 
0d13a08
5b8cf94
0d13a08
 
5b8cf94
 
 
0d13a08
 
5b8cf94
 
0d13a08
 
 
 
5b8cf94
 
 
 
 
0d13a08
5b8cf94
 
0d13a08
5b8cf94

import pypdf
import docx
import pandas as pd
from pdf2image import convert_from_bytes
import pytesseract
import uuid

def process_file(uploaded_file):
    """
    Extract the text content of an uploaded document.

    Input: Streamlit UploadedFile (any object exposing ``name`` and the
        file-like interface used below: ``getvalue()`` for PDFs, ``read()``
        for plain text; for the other formats the object is handed directly
        to python-docx / pandas).
    Output: (full_text, filename, method)
        full_text -- extracted text, or "" on failure / unsupported type.
        filename  -- ``uploaded_file.name``, returned unchanged.
        method    -- "Fast Text", "OCR (Slow)", "Table Parse", or a string
                     starting with "Error: " describing what went wrong.

    The file type is chosen from the filename extension, compared
    case-insensitively. No exception escapes: any failure while parsing is
    reported through the ``method`` element instead.
    """
    filename = uploaded_file.name
    # Compare against a lowercased copy so "REPORT.PDF" is treated like
    # "report.pdf"; the original filename is what gets returned.
    lowered = filename.lower()
    text = ""
    method = "Fast Text"

    try:
        # 1. PDF Handling
        if lowered.endswith(".pdf"):
            # Grab the raw bytes up front: the OCR fallback needs them.
            pdf_bytes = uploaded_file.getvalue()
            reader = pypdf.PdfReader(uploaded_file)

            page_parts = []
            for i, page in enumerate(reader.pages):
                extracted = page.extract_text()
                if extracted:
                    page_parts.append(f"\n[PAGE {i+1}] {extracted}")
            text = "".join(page_parts)

            # OCR Fallback: fewer than 50 characters (page markers included)
            # is treated as "no usable text layer", so the pages are
            # rasterized and run through Tesseract instead.
            if len(text.strip()) < 50:
                method = "OCR (Slow)"
                images = convert_from_bytes(pdf_bytes)
                text = "".join(
                    f"\n[PAGE {i+1}] {pytesseract.image_to_string(img)}"
                    for i, img in enumerate(images)
                )

        # 2. Word Handling
        elif lowered.endswith(".docx"):
            doc = docx.Document(uploaded_file)
            text = "\n".join(para.text for para in doc.paragraphs)

        # 3. Excel/CSV Handling
        elif lowered.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            text = df.to_string(index=False)
            method = "Table Parse"

        elif lowered.endswith((".xlsx", ".xls")):
            # read_excel is called with pandas defaults.
            df = pd.read_excel(uploaded_file)
            text = df.to_string(index=False)
            method = "Table Parse"

        # 4. Plain Text
        elif lowered.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")

        # 5. Anything else: report it rather than returning empty text
        #    labelled as a successful "Fast Text" extraction.
        else:
            return "", filename, "Error: Unsupported file type"

    except Exception as e:
        # Deliberately broad: the parsers above raise many unrelated
        # exception types and the caller is told about failures via `method`.
        return "", filename, f"Error: {str(e)}"

    return text, filename, method

def chunk_text(text, source, chunk_size=500, overlap=100):
    """
    Generates chunks AND assigns a unique doc_id to link them together.

    The text is split on whitespace and walked with a sliding window of
    ``chunk_size`` words that advances ``chunk_size - overlap`` words per
    step, so consecutive chunks share ``overlap`` words.

    Input:
        text       -- full document text.
        source     -- stored verbatim under each chunk's "source" key.
        chunk_size -- words per chunk; must be positive.
        overlap    -- words shared between neighbouring chunks; must be
                      smaller than ``chunk_size``.
    Output: (chunks, doc_id)
        chunks -- list of dicts with keys "text", "source", "doc_id"
                  (same for every chunk of this call) and "chunk_id"
                  (fresh UUID per chunk). Chunks whose joined text is
                  20 characters or shorter are dropped.
        doc_id -- the UUID string generated for this call; returned even
                  when no chunks are produced.
    Raises:
        ValueError -- if ``chunk_size`` is not positive or ``overlap`` is
                      not smaller than ``chunk_size`` (the window could
                      never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    total = len(words)
    step = chunk_size - overlap
    chunks = []
    doc_id = str(uuid.uuid4())  # Generate ID once per document

    for start in range(0, total, step):
        piece = " ".join(words[start:start + chunk_size])
        if len(piece) > 20:  # Minimal filter
            chunks.append({
                "text": piece,
                "source": source,
                "doc_id": doc_id,
                "chunk_id": str(uuid.uuid4()),
            })
        # Once a window has reached the last word, any further window would
        # only repeat words already contained in this chunk, so stop here.
        if start + chunk_size >= total:
            break

    return chunks, doc_id