sentence transformer added
Files changed:
- app.py (+65 -64)
- requirements.txt (+4 -1)
app.py
CHANGED
@@ -3,98 +3,99 @@ from fastapi.middleware.cors import CORSMiddleware
 import uuid
 import logging
 import io
-
-# NEW: Import parsing libraries
-import fitz  # PyMuPDF
+import fitz
 from PIL import Image
 import pytesseract
+import numpy as np
+
+# NEW: Import AI and search libraries
+from sentence_transformers import SentenceTransformer
+import faiss
 
-#
+# --- Basic Setup (Logging, FastAPI, CORS) ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-
 app = FastAPI()
-
-# CORS Middleware
 app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
+    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
 )
 
+# --- AI MODEL LOADING ---
+# This happens only once when the app starts.
+# 'all-MiniLM-L6-v2' is a great, lightweight model for CPU.
+try:
+    logger.info("Loading sentence-transformer model...")
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    logger.info("Model loaded successfully.")
+except Exception as e:
+    logger.error(f"Failed to load sentence-transformer model: {e}")
+    model = None
+
 # In-memory session store
 SESSION_DATA = {}
-logger.info("Session store initialized.")
 
-# --- NEW: Parsing Functions ---
-def parse_pdf(content: bytes) -> str:
-    try:
-        doc = fitz.open(stream=content, filetype="pdf")
-        text = ""
-        for page in doc:
-            text += page.get_text()
-        logger.info(f"Successfully parsed PDF, extracted {len(text)} characters.")
-        return text
-    except Exception as e:
-        logger.error(f"PDF parsing failed: {e}")
-        return ""
-
-def parse_image(content: bytes) -> str:
-    ...
+# --- Parsing Functions (parse_pdf, parse_image - keep these as they are) ---
+def parse_pdf(content: bytes) -> str: ...  # your existing function
+def parse_image(content: bytes) -> str: ...  # your existing function
 
+# --- NEW: Helper function for chunking text ---
+def chunk_text(text: str, chunk_size: int = 256, overlap: int = 32) -> list[str]:
+    """Splits text into overlapping chunks of words."""
+    words = text.split()
+    if not words:
+        return []
+    chunks = []
+    for i in range(0, len(words), chunk_size - overlap):
+        chunk = " ".join(words[i:i + chunk_size])
+        chunks.append(chunk)
+    return chunks
 
 # --- MODIFIED: The /upload Endpoint ---
-
 @app.post("/upload")
 async def upload_file(file: UploadFile = File(...)):
+    if not model:
+        raise HTTPException(status_code=503, detail="AI model is not available.")
+
     session_id = str(uuid.uuid4())
     logger.info(f"New upload '{file.filename}'. Creating session_id: {session_id}")
-
     content = await file.read()
+
+    # 1. PARSE (This part is the same as before)
     extracted_text = ""
-    if file.content_type == "application/pdf":
-        extracted_text = parse_pdf(content)
-    elif file.content_type and file.content_type.startswith("image/"):
-        extracted_text = parse_image(content)
-    elif file.content_type == "text/plain":
-        extracted_text = content.decode("utf-8")
-    else:
-        raise HTTPException(status_code=400, detail=f"Unsupported file type: {file.content_type}")
+    if file.content_type == "application/pdf": extracted_text = parse_pdf(content)
+    elif file.content_type and file.content_type.startswith("image/"): extracted_text = parse_image(content)
+    elif file.content_type == "text/plain": extracted_text = content.decode("utf-8")
+    else: raise HTTPException(status_code=400, detail=f"Unsupported file type: {file.content_type}")
 
-    if not extracted_text:
+    if not extracted_text.strip():
         raise HTTPException(status_code=400, detail="Could not extract any text from the file.")
 
-    #
+    # 2. CHUNK
+    text_chunks = chunk_text(extracted_text)
+    logger.info(f"Text chunked into {len(text_chunks)} pieces.")
+    if not text_chunks:
+        raise HTTPException(status_code=400, detail="Document is empty or too short to be chunked.")
+
+    # 3. EMBED
+    logger.info("Generating embeddings for text chunks...")
+    embeddings = model.encode(text_chunks, convert_to_numpy=True)
+    logger.info(f"Embeddings generated with shape: {embeddings.shape}")
+
+    # 4. INDEX
+    d = embeddings.shape[1]  # Dimension of embeddings
+    index = faiss.IndexFlatL2(d)
+    index.add(embeddings.astype('float32'))  # FAISS requires float32
+    logger.info(f"FAISS index created with {index.ntotal} vectors.")
+
+    # Store the index AND the original text chunks in the session
     SESSION_DATA[session_id] = {
         "filename": file.filename,
-        "text": extracted_text,
+        "chunks": text_chunks,
+        "index": faiss.serialize_index(index)  # serialize for storage (FAISS Index objects have no .serialize() method)
     }
 
     return {
         "session_id": session_id,
         "filename": file.filename,
-        "
-    }
+        "chunks_created": len(text_chunks)
+    }
-
-# This endpoint is useful for debugging
-@app.get("/session/{session_id}/text")
-def get_session_text(session_id: str):
-    if session_id not in SESSION_DATA:
-        raise HTTPException(status_code=404, detail="Session not found.")
-    return {"text": SESSION_DATA[session_id].get("text", "")}
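Note: with the defaults (chunk_size=256, overlap=32), the chunking window above advances 256 - 32 = 224 words per step. A minimal sketch of what that yields, reusing chunk_text from this commit; the 500-word input is just an illustration:

words = ["w"] * 500
chunks = chunk_text(" ".join(words))
# range(0, 500, 224) starts chunks at words 0, 224 and 448, so a
# 500-word text becomes 3 chunks, the last holding only 52 words.
assert len(chunks) == 3
assert len(chunks[-1].split()) == 52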
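The commit stores the chunks and the serialized index per session but adds no retrieval route yet (the old /session/{session_id}/text debug endpoint is removed). A sketch of how a follow-up endpoint might query a session; the route, the q and top_k parameters, and the response shape are assumptions, not part of this commit:

@app.get("/session/{session_id}/search")
def search_session(session_id: str, q: str, top_k: int = 3):
    # Hypothetical endpoint, not in this commit.
    if session_id not in SESSION_DATA:
        raise HTTPException(status_code=404, detail="Session not found.")
    data = SESSION_DATA[session_id]
    # Rebuild the index from the bytes produced by faiss.serialize_index().
    index = faiss.deserialize_index(data["index"])
    # Embed the query with the same model used at upload time; FAISS wants float32.
    query = model.encode([q], convert_to_numpy=True).astype("float32")
    distances, ids = index.search(query, top_k)  # L2 distances: smaller = closer
    return {"matches": [data["chunks"][i] for i in ids[0] if i != -1]}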
requirements.txt
CHANGED
@@ -4,4 +4,7 @@ python-multipart
 
 PyMuPDF
 Pillow
-pytesseract
+pytesseract
+
+sentence-transformers
+faiss-cpu
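Note: sentence-transformers already depends on numpy (and torch), so the new import numpy as np in app.py needs no extra pin here. A quick sanity check after installing: all-MiniLM-L6-v2 embeds text into 384 dimensions, which is the d the upload endpoint passes to faiss.IndexFlatL2.

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
print(model.encode(["hello world"]).shape)  # (1, 384)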
|