updated app.py
app.py CHANGED
@@ -1,19 +1,15 @@
 #
 # ---------------- Universal Data AI ----------------
 #
-# Final app.py script
-#
-# 1. File Upload & Parsing (PDF, Image, Text)
-# 2. Text Chunking
-# 3. Vector Embedding & FAISS Indexing
-# 4. A Query Endpoint for Question Answering
+# Final app.py script (v3) with robust FAISS I/O
+# Corrects previous serialization errors.
 #
 # Last updated: August 8, 2025
 #
 
 import logging
 import uuid
-import io
+import io  # Ensure io is imported
 
 # FastAPI & Pydantic
 from fastapi import FastAPI, UploadFile, File, HTTPException
@@ -33,33 +29,26 @@ from transformers import pipeline
 
 # --- 1. INITIAL SETUP & MODEL LOADING ---
 
-# Configure logging to see outputs in Hugging Face Space logs
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Initialize FastAPI app
 app = FastAPI(
     title="Universal Data AI",
     description="Ephemeral data analysis tool with in-memory vector search.",
-    version="1.0.
+    version="1.0.1",  # Version bump
 )
 
-# Add CORS middleware to allow frontend requests
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 
-# Load AI models on startup
-# This can take a moment when the app first boots.
 try:
     logger.info("Loading AI models...")
-    # Model for creating vector embeddings
     embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-    # Pipeline for question-answering
     qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
     logger.info("AI models loaded successfully.")
 except Exception as e:
@@ -67,23 +56,19 @@ except Exception as e:
     embedding_model = None
     qa_pipeline = None
 
-# In-memory dictionary to act as our temporary session database
 SESSION_DATA = {}
 
 # --- 2. DATA MODELS ---
 
 class QueryRequest(BaseModel):
-    """Defines the request body for the /query endpoint."""
     question: str
 
 class UploadResponse(BaseModel):
-    """Defines the response for a successful file upload."""
     session_id: str
     filename: str
     chunks_created: int
 
 class QueryResponse(BaseModel):
-    """Defines the response for a successful query."""
     answer: str
     score: float
     context: str
@@ -91,36 +76,40 @@ class QueryResponse(BaseModel):
 # --- 3. HELPER FUNCTIONS ---
 
 def parse_pdf(content: bytes) -> str:
-    """Extracts text from PDF bytes."""
     doc = fitz.open(stream=content, filetype="pdf")
-
-    return text
+    return "".join(page.get_text() for page in doc)
 
 def parse_image(content: bytes) -> str:
-    """Extracts text from image bytes using OCR."""
     image = Image.open(io.BytesIO(content))
     return pytesseract.image_to_string(image)
 
 def chunk_text(text: str, chunk_size: int = 256, overlap: int = 32) -> list[str]:
-    """Splits text into overlapping chunks of words."""
     words = text.split()
     if not words: return []
     return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size - overlap)]
 
+# --- THIS FUNCTION IS CORRECTED ---
 def deserialize_index(serialized_index: bytes) -> faiss.Index:
-    """
-
+    """
+    Loads a FAISS index from its byte representation using a robust method.
+    """
+    try:
+        bio = io.BytesIO(serialized_index)
+        # Use PyCallbackIOReader to read from the in-memory binary stream
+        reader = faiss.PyCallbackIOReader(bio.read)
+        return faiss.read_index(reader)
+    except Exception as e:
+        logger.error(f"Failed to deserialize FAISS index: {e}")
+        raise
 
 # --- 4. API ENDPOINTS ---
 
 @app.get("/")
 def read_root():
-    """Root endpoint for health checks."""
     return {"status": "ok", "message": "Welcome to Universal Data AI"}
 
 @app.post("/upload", response_model=UploadResponse)
 async def upload_file(file: UploadFile = File(...)):
-    """Handles file upload, parsing, and AI indexing."""
     if not embedding_model:
         raise HTTPException(status_code=503, detail="AI models are not available.")
 
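The corrected deserialize_index pairs with the serialization block added further down in this diff. Below is a minimal round-trip sketch of the same PyCallbackIOWriter / PyCallbackIOReader pattern; the dimensions and random vectors are toy values for illustration, not anything from the repo:

```python
import io

import faiss
import numpy as np

# Build a tiny index over random vectors (toy data; dim=8 is arbitrary).
dim = 8
index = faiss.IndexFlatL2(dim)
index.add(np.random.rand(16, dim).astype("float32"))

# Serialize: stream the index into an in-memory buffer.
bio = io.BytesIO()
faiss.write_index(index, faiss.PyCallbackIOWriter(bio.write))
data = bio.getvalue()

# Deserialize: rebuild an equivalent index from the raw bytes.
restored = faiss.read_index(faiss.PyCallbackIOReader(io.BytesIO(data).read))
assert restored.ntotal == index.ntotal  # same number of stored vectors
```

FAISS also ships serialize_index / deserialize_index helpers that round-trip through a NumPy uint8 array, which would be a shorter alternative to the callback plumbing used here.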
@@ -128,7 +117,6 @@ async def upload_file(file: UploadFile = File(...)):
     logger.info(f"Upload received for session {session_id}: {file.filename}")
     content = await file.read()
 
-    # Step 1: Parse content based on file type
     content_type = file.content_type
     if content_type == "application/pdf": text = parse_pdf(content)
     elif content_type and content_type.startswith("image/"): text = parse_image(content)
@@ -138,22 +126,28 @@ async def upload_file(file: UploadFile = File(...)):
     if not text.strip():
         raise HTTPException(status_code=400, detail="No text could be extracted from the file.")
 
-    # Step 2: Chunk the text
     text_chunks = chunk_text(text)
     if not text_chunks:
         raise HTTPException(status_code=400, detail="Document too short to be processed.")
 
-    # Step 3: Generate embeddings
     embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True).astype('float32')
-
-    # Step 4: Create and store FAISS index
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)
 
+    # --- THIS SECTION IS CORRECTED ---
+    try:
+        # Use PyCallbackIOWriter to write the index to an in-memory binary stream
+        bio = io.BytesIO()
+        writer = faiss.PyCallbackIOWriter(bio.write)
+        faiss.write_index(index, writer)
+        serialized_index = bio.getvalue()
+    except Exception as e:
+        logger.error(f"Failed to serialize FAISS index: {e}")
+        raise HTTPException(status_code=500, detail="Failed to create document index.")
+
     SESSION_DATA[session_id] = {
-        "filename": file.filename,
         "chunks": text_chunks,
-        "index":
+        "index": serialized_index,  # Store the index as bytes
     }
 
     logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
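A note on the chunking step above: chunk_text advances through the word list with a stride of chunk_size - overlap, so consecutive chunks share overlap words. A quick worked example with the default parameters (toy input, illustration only):

```python
# With chunk_size=256 and overlap=32 the stride is 256 - 32 = 224 words,
# so a 500-word document yields chunks starting at words 0, 224, and 448.
words = [f"w{i}" for i in range(500)]
stride = 256 - 32
starts = list(range(0, len(words), stride))
print(starts)       # [0, 224, 448]
print(len(starts))  # 3 chunks; only the last one is shorter than 256 words
```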
@@ -161,32 +155,27 @@ async def upload_file(file: UploadFile = File(...)):
 
 @app.post("/query/{session_id}", response_model=QueryResponse)
 async def query_session(session_id: str, request: QueryRequest):
-    """Answers a question based on the indexed content of a session."""
     if not qa_pipeline or not embedding_model:
         raise HTTPException(status_code=503, detail="AI models are not available.")
 
-    # Step 1: Retrieve session data
     session = SESSION_DATA.get(session_id)
     if not session:
         raise HTTPException(status_code=404, detail="Session not found.")
 
-    # Step 2: Find relevant context using vector search
-    question_embedding = embedding_model.encode([request.question]).astype('float32')
     index = deserialize_index(session["index"])
+    question_embedding = embedding_model.encode([request.question]).astype('float32')
 
-    # Search for the top 3 most relevant chunks
     k = min(3, index.ntotal)
     distances, indices = index.search(question_embedding, k)
 
     relevant_chunks = [session["chunks"][i] for i in indices[0]]
     context = " ".join(relevant_chunks)
 
-    # Step 3: Use the QA model to find the answer within the context
     result = qa_pipeline(question=request.question, context=context)
 
     logger.info(f"Query for session {session_id} answered with score: {result['score']:.4f}")
     return {
         "answer": result["answer"],
         "score": result["score"],
-        "context": context
+        "context": context,
     }
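With serialization fixed, the two endpoints can be exercised end to end. A hypothetical client sketch follows; the base URL, port, and file name are placeholder assumptions, not values from this repo:

```python
import requests

BASE_URL = "http://localhost:7860"  # placeholder; point at the deployed Space

# Upload a PDF and read the session id from the UploadResponse.
with open("sample.pdf", "rb") as f:  # sample.pdf is a placeholder file
    r = requests.post(
        f"{BASE_URL}/upload",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
r.raise_for_status()
session_id = r.json()["session_id"]

# Ask a question against the indexed session.
r = requests.post(
    f"{BASE_URL}/query/{session_id}",
    json={"question": "What is this document about?"},
)
print(r.json())  # {"answer": ..., "score": ..., "context": ...}
```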