Spaces:

Kalpokoch
/

OpenQuery

Sleeping

App Files Files Community

Kalpokoch committed on Aug 8

Commit

bc8a612

1 Parent(s): d1af85f

added vector embedding and query endpoint

Browse files

Files changed (1) hide show

app.py +157 -67

app.py CHANGED Viewed

@@ -1,101 +1,191 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-import uuid
 import logging
 import io
-import fitz
 from PIL import Image
 import pytesseract
-import numpy as np
-# NEW: Import AI and search libraries
-from sentence_transformers import SentenceTransformer
 import faiss
-# --- Basic Setup (Logging, FastAPI, CORS) ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-app = FastAPI()
 app.add_middleware(
-    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
 )
-# --- AI MODEL LOADING ---
-# This happens only once when the app starts.
-# 'all-MiniLM-L6-v2' is a great, lightweight model for CPU.
 try:
-    logger.info("Loading sentence-transformer model...")
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    logger.info("Model loaded successfully.")
 except Exception as e:
-    logger.error(f"Failed to load sentence-transformer model: {e}")
-    model = None
-# In-memory session store
 SESSION_DATA = {}
-# --- Parsing Functions (parse_pdf, parse_image - keep these as they are) ---
-def parse_pdf(content: bytes) -> str: # ... your existing function ...
-def parse_image(content: bytes) -> str: # ... your existing function ...
-# --- NEW: Helper function for chunking text ---
 def chunk_text(text: str, chunk_size: int = 256, overlap: int = 32) -> list[str]:
     """Splits text into overlapping chunks of words."""
     words = text.split()
-    if not words:
-        return []
-    chunks = []
-    for i in range(0, len(words), chunk_size - overlap):
-        chunk = " ".join(words[i:i + chunk_size])
-        chunks.append(chunk)
-    return chunks
-# --- MODIFIED: The /upload Endpoint ---
-@app.post("/upload")
 async def upload_file(file: UploadFile = File(...)):
-    if not model:
-        raise HTTPException(status_code=503, detail="AI model is not available.")
     session_id = str(uuid.uuid4())
-    logger.info(f"New upload '{file.filename}'. Creating session_id: {session_id}")
     content = await file.read()
-    # 1. PARSE (This part is the same as before)
-    extracted_text = ""
-    if file.content_type == "application/pdf": extracted_text = parse_pdf(content)
-    elif file.content_type and file.content_type.startswith("image/"): extracted_text = parse_image(content)
-    elif file.content_type == "text/plain": extracted_text = content.decode("utf-8")
-    else: raise HTTPException(status_code=400, detail=f"Unsupported file type: {file.content_type}")
-    if not extracted_text.strip():
-        raise HTTPException(status_code=400, detail="Could not extract any text from the file.")
-    # 2. CHUNK
-    text_chunks = chunk_text(extracted_text)
-    logger.info(f"Text chunked into {len(text_chunks)} pieces.")
-    if not text_chunks:
-        raise HTTPException(status_code=400, detail="Document is empty or too short to be chunked.")
-    # 3. EMBED
-    logger.info("Generating embeddings for text chunks...")
-    embeddings = model.encode(text_chunks, convert_to_numpy=True)
-    logger.info(f"Embeddings generated with shape: {embeddings.shape}")
-    # 4. INDEX
-    d = embeddings.shape[1]  # Dimension of embeddings
-    index = faiss.IndexFlatL2(d)
-    index.add(embeddings.astype('float32')) # FAISS requires float32
-    logger.info(f"FAISS index created with {index.ntotal} vectors.")
-    # Store the index AND the original text chunks in the session
     SESSION_DATA[session_id] = {
-        "filename": file.filename,
         "chunks": text_chunks,
-        "index": index.serialize()  # Serialize the index for storage
     }
     return {
-        "session_id": session_id,
-        "filename": file.filename,
-        "chunks_created": len(text_chunks)
     }

+#
+# ---------------- Universal Data AI ----------------
+#
+# Final app.py script
+# Combines:
+# 1. File Upload & Parsing (PDF, Image, Text)
+# 2. Text Chunking
+# 3. Vector Embedding & FAISS Indexing
+# 4. A Query Endpoint for Question Answering
+#
+# Last updated: August 8, 2025
+#
 import logging
+import uuid
 import io
+# FastAPI & Pydantic
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+# Parsing Libraries
+import fitz  # PyMuPDF
 from PIL import Image
 import pytesseract
+# AI & Search Libraries
+import numpy as np
 import faiss
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
# --- 1. INITIAL SETUP & MODEL LOADING ---
# Emit INFO-level records so progress messages show up in the Space logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The application object that every route below registers itself on.
app = FastAPI(
    title="Universal Data AI",
    description="Ephemeral data analysis tool with in-memory vector search.",
    version="1.0.0",
)

# Permissive CORS so a separately hosted frontend can reach the API.
# NOTE(review): a wildcard origin combined with allow_credentials=True is
# broader than most deployments need -- consider pinning the frontend URL.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Both models are loaded once at import time. If either load fails, both
# globals are set to None and the endpoints answer 503 instead of the
# process crashing.
try:
    logger.info("Loading AI models...")
    # Sentence encoder used to embed document chunks and questions.
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Extractive question-answering pipeline run over the retrieved context.
    qa_pipeline = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
    )
    logger.info("AI models loaded successfully.")
except Exception as e:
    logger.critical(f"Fatal error: Could not load AI models. {e}")
    embedding_model = qa_pipeline = None

# Process-local session store keyed by session id; contents are lost on restart.
SESSION_DATA = {}
+# --- 2. DATA MODELS ---
class QueryRequest(BaseModel):
    """Request body accepted by the /query endpoint."""

    # The question to answer against the session's indexed document.
    question: str
class UploadResponse(BaseModel):
    """Response body returned after a file has been uploaded and indexed."""

    # Identifier the client passes to /query/{session_id} afterwards.
    session_id: str
    # Name of the uploaded file, echoed back from the request.
    filename: str
    # Number of text chunks that were embedded and indexed.
    chunks_created: int
class QueryResponse(BaseModel):
    """Response body returned by a successful query."""

    # Answer span produced by the question-answering pipeline.
    answer: str
    # Score the pipeline reported for that answer.
    score: float
    # The retrieved chunks, joined, that the answer was extracted from.
    context: str
+# --- 3. HELPER FUNCTIONS ---
def parse_pdf(content: bytes) -> str:
    """Extract the text of every page of a PDF.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The text of all pages concatenated in page order.
    """
    # Open the document as a context manager so it is closed even when
    # text extraction raises part-way through.
    with fitz.open(stream=content, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def parse_image(content: bytes) -> str:
    """Run OCR over an in-memory image and return the recognised text."""
    # Wrap the raw bytes in a file-like object so Pillow can open them.
    buffer = io.BytesIO(content)
    picture = Image.open(buffer)
    recognised = pytesseract.image_to_string(picture)
    return recognised
 def chunk_text(text: str, chunk_size: int = 256, overlap: int = 32) -> list[str]:
     """Splits text into overlapping chunks of words."""
     words = text.split()
+    if not words: return []
+    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size - overlap)]
def deserialize_index(serialized_index: bytes) -> faiss.Index:
    """Rebuild a FAISS index from its serialized byte representation.

    Args:
        serialized_index: The index as bytes (or any contiguous buffer of
            uint8 values), in the format produced by
            ``faiss.serialize_index``.

    Returns:
        The reconstructed FAISS index.
    """
    # faiss.deserialize_index expects a uint8 numpy array. frombuffer over
    # bytes yields a read-only view, so copy to hand FAISS an owned array.
    raw = np.frombuffer(serialized_index, dtype=np.uint8).copy()
    return faiss.deserialize_index(raw)
+# --- 4. API ENDPOINTS ---
@app.get("/")
def read_root():
    """Health-check endpoint: report that the service is up."""
    payload = {
        "status": "ok",
        "message": "Welcome to Universal Data AI",
    }
    return payload
@app.post("/upload", response_model=UploadResponse)
async def upload_file(file: UploadFile = File(...)):
    """Parse an uploaded file, embed its text and store a searchable index.

    The file is read fully into memory, converted to text according to its
    content type (PDF, image via OCR, or plain text), split into chunks,
    embedded, and indexed with FAISS. The chunks and the serialized index
    are stored in ``SESSION_DATA`` under a freshly generated session id.

    Raises:
        HTTPException: 503 when the embedding model failed to load; 400 for
            an unsupported content type, text that is not valid UTF-8, or a
            file that yields no text.
    """
    if not embedding_model:
        raise HTTPException(status_code=503, detail="AI models are not available.")

    session_id = str(uuid.uuid4())
    logger.info(f"Upload received for session {session_id}: {file.filename}")
    content = await file.read()

    # Step 1: Parse content based on file type
    content_type = file.content_type
    if content_type == "application/pdf":
        text = parse_pdf(content)
    elif content_type and content_type.startswith("image/"):
        text = parse_image(content)
    elif content_type == "text/plain":
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError as exc:
            # A mis-encoded upload is a client error, not a server fault.
            raise HTTPException(
                status_code=400,
                detail="Text file is not valid UTF-8.",
            ) from exc
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {content_type}")

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text could be extracted from the file.")

    # Step 2: Chunk the text
    text_chunks = chunk_text(text)
    if not text_chunks:
        raise HTTPException(status_code=400, detail="Document too short to be processed.")

    # Step 3: Generate embeddings (cast to float32 before adding to FAISS)
    embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True).astype('float32')

    # Step 4: Create and store FAISS index
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    SESSION_DATA[session_id] = {
        "chunks": text_chunks,
        # faiss.serialize_index returns a uint8 array; keep it as bytes.
        "index": faiss.serialize_index(index).tobytes(),
    }

    logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
    return {"session_id": session_id, "filename": file.filename, "chunks_created": len(text_chunks)}
@app.post("/query/{session_id}", response_model=QueryResponse)
async def query_session(session_id: str, request: QueryRequest):
    """Answer a question using the indexed content of an upload session.

    The question is embedded, the closest chunks (up to three) are looked up
    in the session's FAISS index, and the question-answering pipeline
    extracts an answer from those chunks joined together.

    Raises:
        HTTPException: 503 when either model failed to load; 404 when the
            session id is unknown; 400 when the question is blank.
    """
    if not qa_pipeline or not embedding_model:
        raise HTTPException(status_code=503, detail="AI models are not available.")

    # Step 1: Retrieve session data
    session = SESSION_DATA.get(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found.")

    # Reject blank questions up front instead of letting the pipeline fail
    # and surface as an internal server error.
    if not request.question.strip():
        raise HTTPException(status_code=400, detail="Question must not be empty.")

    # Step 2: Find relevant context using vector search
    question_embedding = embedding_model.encode([request.question]).astype('float32')
    index = deserialize_index(session["index"])

    # Search for the top 3 most relevant chunks (fewer if the index is smaller)
    k = min(3, index.ntotal)
    distances, indices = index.search(question_embedding, k)

    # Skip negative ids: a -1 would otherwise silently select the last chunk.
    # NOTE(review): FAISS is understood to pad missing results with -1 -- confirm.
    relevant_chunks = [session["chunks"][i] for i in indices[0] if i >= 0]
    context = " ".join(relevant_chunks)

    # Step 3: Use the QA model to find the answer within the context
    result = qa_pipeline(question=request.question, context=context)
    logger.info(f"Query for session {session_id} answered with score: {result['score']:.4f}")

    return {
        "answer": result["answer"],
        # Coerce to a built-in float so the response model always gets a plain number.
        "score": float(result["score"]),
        "context": context,
    }