Kalpokoch committed
Commit 9a000fe · 1 Parent(s): 01eba2b

updated app.py, included phi2

Files changed (5)
  1. app.py +76 -122
  2. core/__init__.py +0 -0
  3. core/chunking.py +46 -0
  4. core/vector_store.py +38 -0
  5. requirements.txt +7 -4
app.py CHANGED
@@ -1,181 +1,135 @@
- #
- # ---------------- Universal Data AI ----------------
- #
- # Final app.py script (v3) with robust FAISS I/O
- # Corrects previous serialization errors.
- #
- # Last updated: August 8, 2025
- #

  import logging
  import uuid
- import io  # Ensure io is imported
-
- # FastAPI & Pydantic
  from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel

- # Parsing Libraries
- import fitz  # PyMuPDF
  from PIL import Image
  import pytesseract
-
- # AI & Search Libraries
- import numpy as np
- import faiss
  from sentence_transformers import SentenceTransformer
- from transformers import pipeline

  # --- 1. INITIAL SETUP & MODEL LOADING ---

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- app = FastAPI(
-     title="Universal Data AI",
-     description="Ephemeral data analysis tool with in-memory vector search.",
-     version="1.0.1",  # Version bump
- )

  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
  )

  try:
      logger.info("Loading AI models...")
-     embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-     qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
      logger.info("AI models loaded successfully.")
  except Exception as e:
      logger.critical(f"Fatal error: Could not load AI models. {e}")
      embedding_model = None
-     qa_pipeline = None

  SESSION_DATA = {}

  # --- 2. DATA MODELS ---

- class QueryRequest(BaseModel):
-     question: str
-
- class UploadResponse(BaseModel):
-     session_id: str
-     filename: str
-     chunks_created: int
-
- class QueryResponse(BaseModel):
-     answer: str
-     score: float
-     context: str
-
- # --- 3. HELPER FUNCTIONS ---
-
  def parse_pdf(content: bytes) -> str:
-     doc = fitz.open(stream=content, filetype="pdf")
-     return "".join(page.get_text() for page in doc)
-
  def parse_image(content: bytes) -> str:
-     image = Image.open(io.BytesIO(content))
-     return pytesseract.image_to_string(image)
-
- def chunk_text(text: str, chunk_size: int = 256, overlap: int = 32) -> list[str]:
-     words = text.split()
-     if not words: return []
-     return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size - overlap)]
-
- # --- THIS FUNCTION IS CORRECTED ---
- def deserialize_index(serialized_index: bytes) -> faiss.Index:
-     """
-     Loads a FAISS index from its byte representation using a robust method.
-     """
-     try:
-         bio = io.BytesIO(serialized_index)
-         # Use PyCallbackIOReader to read from the in-memory binary stream
-         reader = faiss.PyCallbackIOReader(bio.read)
-         return faiss.read_index(reader)
-     except Exception as e:
-         logger.error(f"Failed to deserialize FAISS index: {e}")
-         raise

  # --- 4. API ENDPOINTS ---

  @app.get("/")
- def read_root():
-     return {"status": "ok", "message": "Welcome to Universal Data AI"}

  @app.post("/upload", response_model=UploadResponse)
  async def upload_file(file: UploadFile = File(...)):
-     if not embedding_model:
-         raise HTTPException(status_code=503, detail="AI models are not available.")
-
      session_id = str(uuid.uuid4())
-     logger.info(f"Upload received for session {session_id}: {file.filename}")
      content = await file.read()
-
      content_type = file.content_type
      if content_type == "application/pdf": text = parse_pdf(content)
      elif content_type and content_type.startswith("image/"): text = parse_image(content)
-     elif content_type == "text/plain": text = content.decode("utf-8")
-     else: raise HTTPException(status_code=400, detail=f"Unsupported file type: {content_type}")
-
-     if not text.strip():
-         raise HTTPException(status_code=400, detail="No text could be extracted from the file.")
-
-     text_chunks = chunk_text(text)
-     if not text_chunks:
-         raise HTTPException(status_code=400, detail="Document too short to be processed.")
-
-     embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True).astype('float32')
-     index = faiss.IndexFlatL2(embeddings.shape[1])
-     index.add(embeddings)
-
-     # --- THIS SECTION IS CORRECTED ---
-     try:
-         # Use PyCallbackIOWriter to write the index to an in-memory binary stream
-         bio = io.BytesIO()
-         writer = faiss.PyCallbackIOWriter(bio.write)
-         faiss.write_index(index, writer)
-         serialized_index = bio.getvalue()
-     except Exception as e:
-         logger.error(f"Failed to serialize FAISS index: {e}")
-         raise HTTPException(status_code=500, detail="Failed to create document index.")
-
-     SESSION_DATA[session_id] = {
-         "chunks": text_chunks,
-         "index": serialized_index,  # Store the index as bytes
-     }
-
      logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
      return {"session_id": session_id, "filename": file.filename, "chunks_created": len(text_chunks)}

  @app.post("/query/{session_id}", response_model=QueryResponse)
  async def query_session(session_id: str, request: QueryRequest):
-     if not qa_pipeline or not embedding_model:
          raise HTTPException(status_code=503, detail="AI models are not available.")
-
      session = SESSION_DATA.get(session_id)
      if not session:
          raise HTTPException(status_code=404, detail="Session not found.")

-     index = deserialize_index(session["index"])
-     question_embedding = embedding_model.encode([request.question]).astype('float32')
-
-     k = min(3, index.ntotal)
      distances, indices = index.search(question_embedding, k)
-
-     relevant_chunks = [session["chunks"][i] for i in indices[0]]
-     context = " ".join(relevant_chunks)

-     result = qa_pipeline(question=request.question, context=context)

-     logger.info(f"Query for session {session_id} answered with score: {result['score']:.4f}")
-     return {
-         "answer": result["answer"],
-         "score": result["score"],
-         "context": context,
-     }
+ # app.py

  import logging
  import uuid
+ import io
  from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel

+ # Import from our core modules
+ from core.chunking import semantic_chunker
+ from core.vector_store import create_faiss_index, deserialize_faiss_index
+
+ # Parsing and AI libraries
+ import fitz
  from PIL import Image
  import pytesseract
  from sentence_transformers import SentenceTransformer
+ from ctransformers import AutoModelForCausalLM  # NEW: for running quantized GGUF models

  # --- 1. INITIAL SETUP & MODEL LOADING ---

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ app = FastAPI(title="Generative Universal Data AI", version="3.0.0")

  app.add_middleware(
      CORSMiddleware,
+     allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
  )

+ # --- Load Models ---
  try:
      logger.info("Loading AI models...")
+     # Model for creating vector embeddings (now BAAI/bge-large-en-v1.5)
+     embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
+
+     # NEW: Load the quantized Phi-2 model using ctransformers.
+     # This downloads a GGUF model file, optimized for CPU inference;
+     # Q4_K_M is a good balance of quality and performance.
+     llm = AutoModelForCausalLM.from_pretrained(
+         "TheBloke/phi-2-GGUF",
+         model_file="phi-2.Q4_K_M.gguf"
+     )
      logger.info("AI models loaded successfully.")
  except Exception as e:
      logger.critical(f"Fatal error: Could not load AI models. {e}")
      embedding_model = None
+     llm = None

  SESSION_DATA = {}

  # --- 2. DATA MODELS ---
+ class QueryRequest(BaseModel): question: str
+ class UploadResponse(BaseModel): session_id: str; filename: str; chunks_created: int
+ # Modified response to reflect generative model output
+ class QueryResponse(BaseModel): answer: str; context: str

+ # --- 3. HELPER FUNCTIONS --- (no changes here)
  def parse_pdf(content: bytes) -> str:
+     doc = fitz.open(stream=content, filetype="pdf"); return "".join(page.get_text() for page in doc)
  def parse_image(content: bytes) -> str:
+     image = Image.open(io.BytesIO(content)); return pytesseract.image_to_string(image)

  # --- 4. API ENDPOINTS ---

  @app.get("/")
+ def read_root(): return {"status": "ok", "message": "Welcome to the Generative Universal Data AI"}

  @app.post("/upload", response_model=UploadResponse)
  async def upload_file(file: UploadFile = File(...)):
+     # This endpoint remains largely the same, using the BGE model and semantic chunking
+     if not embedding_model: raise HTTPException(status_code=503, detail="Embedding model not available.")
+     # ... (the rest of the upload logic is identical to the previous version)
      session_id = str(uuid.uuid4())
      content = await file.read()
      content_type = file.content_type
      if content_type == "application/pdf": text = parse_pdf(content)
      elif content_type and content_type.startswith("image/"): text = parse_image(content)
+     else: text = content.decode("utf-8")
+     if not text.strip(): raise HTTPException(status_code=400, detail="No text could be extracted.")
+     text_chunks = semantic_chunker(text, embedding_model)
+     if not text_chunks: raise HTTPException(status_code=400, detail="Document too short to be processed.")
+     embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True)
+     serialized_index = create_faiss_index(embeddings)
+     if not serialized_index: raise HTTPException(status_code=500, detail="Failed to create document index.")
+     SESSION_DATA[session_id] = {"chunks": text_chunks, "index": serialized_index}
      logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
      return {"session_id": session_id, "filename": file.filename, "chunks_created": len(text_chunks)}

  @app.post("/query/{session_id}", response_model=QueryResponse)
  async def query_session(session_id: str, request: QueryRequest):
+     # --- THIS ENDPOINT IS COMPLETELY REWORKED FOR PHI-2 ---
+     if not llm or not embedding_model:
          raise HTTPException(status_code=503, detail="AI models are not available.")
+
      session = SESSION_DATA.get(session_id)
      if not session:
          raise HTTPException(status_code=404, detail="Session not found.")

+     # Step 1: Retrieve relevant context (same as before)
+     query_with_prefix = f"Represent this sentence for searching relevant passages: {request.question}"
+     question_embedding = embedding_model.encode([query_with_prefix], convert_to_numpy=True).astype('float32')
+     index = deserialize_faiss_index(session["index"])
+     if not index: raise HTTPException(status_code=500, detail="Could not load session index.")
+     k = min(5, index.ntotal)
      distances, indices = index.search(question_embedding, k)
+     context = "\n".join([session["chunks"][i] for i in indices[0]])

+     # Step 2: Create a specific prompt for the generative model.
+     # This template instructs the model on how to behave.
+     prompt = f"""
+ Instruct: Use the following context to answer the question accurately. If the answer is not present in the context, say "The answer is not available in the provided document."
+
+ Context:
+ {context}
+
+ Question: {request.question}
+
+ Answer:"""
+
+     logger.info("Generating answer with Phi-2...")
+
+     # Step 3: Generate the answer
+     answer = llm(
+         prompt,
+         max_new_tokens=256,  # Max length of the answer
+         temperature=0.2,     # Lower temperature for more factual answers
+         stop=["\n", "Instruct:", "Question:"]  # Stop generation at these tokens
+     )
+
+     # Generative models don't give a confidence 'score' like extractive ones,
+     # so we simply return the generated text.
+     return {"answer": answer.strip(), "context": context}
core/__init__.py ADDED
File without changes
core/chunking.py ADDED
@@ -0,0 +1,46 @@
+ # core/chunking.py
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ def semantic_chunker(text: str, model: SentenceTransformer, similarity_threshold: float = 0.55):
+     """
+     Splits text into chunks based on semantic similarity of sentences.
+     """
+     logger.info("Starting semantic chunking...")
+     # First, split the document into sentences: a simple split by newline and period.
+     sentences = [s.strip() for s in text.replace("\n", ". ").split(".") if s.strip()]
+     if not sentences:
+         return []
+
+     # Generate embeddings for each sentence
+     embeddings = model.encode(sentences, convert_to_numpy=True)
+
+     chunks = []
+     current_chunk_sentences = [sentences[0]]
+
+     for i in range(1, len(sentences)):
+         # Calculate similarity between the current sentence and the previous one
+         similarity = cosine_similarity(
+             embeddings[i].reshape(1, -1),
+             embeddings[i - 1].reshape(1, -1)
+         )[0, 0]
+
+         # If similarity is below the threshold, it's a semantic break:
+         # finalize the current chunk and start a new one.
+         if similarity < similarity_threshold:
+             chunks.append(" ".join(current_chunk_sentences))
+             current_chunk_sentences = []
+
+         current_chunk_sentences.append(sentences[i])
+
+     # Add the last remaining chunk
+     if current_chunk_sentences:
+         chunks.append(" ".join(current_chunk_sentences))
+
+     logger.info(f"Semantic chunking resulted in {len(chunks)} chunks.")
+     return chunks
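
A quick sketch of how semantic_chunker behaves in isolation, using the same BGE model app.py loads. The sample text is made up, and whether a given sentence pair actually splits depends on the embeddings; the expected boundary is noted in a comment.

from sentence_transformers import SentenceTransformer
from core.chunking import semantic_chunker

model = SentenceTransformer('BAAI/bge-large-en-v1.5')
text = (
    "The invoice total was 4,200 USD. Payment is due within 30 days.\n"
    "Python is a popular programming language. It is widely used for AI."
)
# The two billing sentences should stay in one chunk; the topic shift to
# Python should drop cosine similarity below 0.55 and open a new chunk.
chunks = semantic_chunker(text, model)
for i, chunk in enumerate(chunks):
    print(i, chunk)
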
core/vector_store.py ADDED
@@ -0,0 +1,38 @@
+ # core/vector_store.py
+
+ import faiss
+ import io
+ import logging
+ from sentence_transformers import SentenceTransformer
+
+ logger = logging.getLogger(__name__)
+
+ def create_faiss_index(embeddings):
+     """Creates a FAISS index from a list of embeddings."""
+     if embeddings is None or len(embeddings) == 0:
+         logger.warning("No embeddings provided to create FAISS index.")
+         return None
+
+     dimension = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dimension)
+     index.add(embeddings.astype('float32'))
+
+     # Serialize the index to bytes for in-memory storage
+     try:
+         bio = io.BytesIO()
+         writer = faiss.PyCallbackIOWriter(bio.write)
+         faiss.write_index(index, writer)
+         return bio.getvalue()
+     except Exception as e:
+         logger.error(f"Failed to serialize FAISS index: {e}")
+         return None
+
+ def deserialize_faiss_index(index_bytes: bytes) -> faiss.Index:
+     """Deserializes a FAISS index from bytes."""
+     try:
+         bio = io.BytesIO(index_bytes)
+         reader = faiss.PyCallbackIOReader(bio.read)
+         return faiss.read_index(reader)
+     except Exception as e:
+         logger.error(f"Failed to deserialize FAISS index: {e}")
+         return None
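
A small round-trip sketch for these helpers, using toy random vectors in place of real BGE embeddings:

import numpy as np
from core.vector_store import create_faiss_index, deserialize_faiss_index

# Toy embeddings: 4 vectors of dimension 8 (real ones come from the embedding model).
rng = np.random.default_rng(0)
embeddings = rng.random((4, 8)).astype('float32')

serialized = create_faiss_index(embeddings)   # bytes, safe to stash in SESSION_DATA
index = deserialize_faiss_index(serialized)   # back to a live faiss.Index

# The nearest neighbour of vector 0 should be itself (distance 0 under L2).
distances, indices = index.search(embeddings[:1], 2)
print(indices[0])
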
requirements.txt CHANGED
@@ -1,10 +1,13 @@
  fastapi
- uvicorn[standard]
  python-multipart
-
  PyMuPDF
  Pillow
  pytesseract
-
  sentence-transformers
- faiss-cpu
+ ctransformers>=0.2.27
  fastapi
+ uvicorn
  python-multipart
+ pydantic
  PyMuPDF
  Pillow
  pytesseract
  sentence-transformers
+ faiss-cpu
+ transformers
+ torch
+ scikit-learn