Update query_service.py
query_service.py CHANGED (+342, -373)
@@ -1,373 +1,342 @@
[removed: the previous 373-line version of query_service.py; its content is not recoverable from this page apart from fragments showing that the old endpoints returned the raw model output, result.get("response", result.get("error", "No response")), without the cleaning step added below. The full updated file follows.]
import os
import json
import requests
import base64
import re
from fastapi import FastAPI
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.documents import Document

# ───────────────────────────────────────────────
# Configuration
# ───────────────────────────────────────────────
VECTOR_PATH = "./vectorstore/faiss_index"
DOCSTORE_PATH = "./docstore"
FINAL_ANSWER_URL = "https://sameer-handsome173-multi-modal.hf.space/final_answer"
EXTENDED_TIMEOUT = int(os.getenv("FINAL_ANSWER_TIMEOUT", 150))

app = FastAPI(title="Multimodal RAG Query Service")

# ───────────────────────────────────────────────
# JSONFileStore
# ───────────────────────────────────────────────
class JSONFileStore:
    def __init__(self, store_path: str):
        self.store_path = store_path
        os.makedirs(self.store_path, exist_ok=True)

    def mget(self, keys: list[str]) -> list[Document]:
        """Retrieve multiple documents by their keys."""
        documents = []
        for key in keys:
            file_path = os.path.join(self.store_path, f"{key}.json")
            if os.path.exists(file_path):
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        doc_dict = json.load(f)
                        documents.append(
                            Document(page_content=doc_dict["page_content"], metadata=doc_dict["metadata"])
                        )
                except Exception as e:
                    print(f"Error loading {key}: {e}")
                    documents.append(None)
            else:
                documents.append(None)
        return documents

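# Note: mget follows the langchain BaseStore.mget convention of returning None
# for missing keys; the retrieval helper below filters those Nones out.
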
# ───────────────────────────────────────────────
# Initialize embeddings, vectorstore, docstore
# ───────────────────────────────────────────────
print("Loading embedding model...")
try:
    embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    print("Embedding model loaded")
except Exception as e:
    print(f"Error loading embeddings: {e}")
    raise

try:
    if os.path.exists(VECTOR_PATH):
        vectorstore = FAISS.load_local(VECTOR_PATH, embedding_fn, allow_dangerous_deserialization=True)
        print("Loaded FAISS vectorstore")
    else:
        raise FileNotFoundError("Vectorstore not found")
except Exception as e:
    print(f"Error loading vectorstore: {e}")
    raise

try:
    if not os.path.exists(DOCSTORE_PATH):
        raise FileNotFoundError("Docstore not found")
    store = JSONFileStore(DOCSTORE_PATH)
    print("Loaded JSONFileStore")
except Exception as e:
    print(f"Error loading docstore: {e}")
    raise

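# The loaders above run at import time, so a missing index or docstore fails
# the service at startup rather than surfacing on the first request.
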
# ───────────────────────────────────────────────
# Response cleaning helper
# ───────────────────────────────────────────────
def clean_response_text(text: str) -> str:
    """Clean the model's response: strip hashtags, emojis, repetitions, and noisy tails."""
    if not text:
        return text

    # Remove hashtags and URLs
    text = re.sub(r"#\S+", "", text)
    text = re.sub(r"http\S+", "", text)

    # Remove non-ASCII characters (emojis, special symbols)
    text = text.encode("ascii", "ignore").decode()

    # Collapse repeated word sequences (e.g. "word word word" -> "word")
    text = re.sub(r"\b(\w+)( \1\b)+", r"\1", text, flags=re.IGNORECASE)

    # Collapse multiple newlines and spaces
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r" {2,}", " ", text).strip()

    # Remove trailing model apology lines or noisy tails
    text = re.sub(r"I'm sorry.*", "", text, flags=re.IGNORECASE)

    return text.strip()

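# Illustration (hypothetical strings): the hashtag and emoji are removed,
# "the the" is collapsed, and trailing blank lines are trimmed:
#   clean_response_text("Great answer!! #RAG 🚀 the the answer is 42\n\n\n")
#   -> "Great answer!! the answer is 42"
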
# ───────────────────────────────────────────────
# Helpers for parsing, retrieval and final call
# ───────────────────────────────────────────────
def parse_docs(docs: list[Document]) -> dict:
    """
    Split retrieved documents into images, texts, and tables.
    Returns dict with lists: {"images": [...], "texts": [...], "tables": [...]}
    """
    images, texts, tables = [], [], []

    for doc in docs:
        doc_type = doc.metadata.get("type", "text")
        if doc_type == "image" and doc.metadata.get("is_base64", False):
            # store base64 string
            images.append(doc.page_content)
        elif doc_type == "table":
            tables.append(doc.page_content)
        else:
            texts.append(doc.page_content)

    return {"images": images, "texts": texts, "tables": tables}

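# Note: image documents without the is_base64 metadata flag fall through to
# "texts", so only properly flagged base64 images are forwarded to the model.
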
def retrieve_documents(query: str, k: int = 5) -> list[Document]:
    """
    Retrieve documents:
    1. Search vectorstore for similar summaries
    2. Collect unique doc_ids from results (avoid duplicates)
    3. Retrieve originals from docstore
    """
    try:
        similar_docs = vectorstore.similarity_search(query, k=k)
        if not similar_docs:
            print("No similar documents found")
            return []

        doc_ids = []
        for doc in similar_docs:
            doc_id = doc.metadata.get("doc_id")
            if doc_id and doc_id not in doc_ids:
                doc_ids.append(doc_id)

        if not doc_ids:
            print("No doc_ids found in metadata")
            return []

        print(f"Found {len(doc_ids)} unique doc_ids")

        original_docs = store.mget(doc_ids)
        original_docs = [d for d in original_docs if d is not None]
        print(f"Retrieved {len(original_docs)} unique documents")

        return original_docs

    except Exception as e:
        print(f"Error in retrieval: {e}")
        return []

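# Because k bounds the summary search and duplicate doc_ids are dropped,
# fewer than k original documents may come back.
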
def build_context_and_images(docs_by_type: dict) -> tuple[str, list[str]]:
    """
    Build context text from texts and tables, and collect image base64 strings.
    Returns: (context_text, list_of_base64_images)
    """
    context_parts = []

    # Add text documents
    for i, text_content in enumerate(docs_by_type.get("texts", []), 1):
        context_parts.append(f"--- Text Document {i} ---\n{text_content}")

    # Add table documents
    for i, table_content in enumerate(docs_by_type.get("tables", []), 1):
        context_parts.append(f"--- Table {i} ---\n{table_content}")

    context_text = "\n\n".join(context_parts).strip()
    images_b64 = docs_by_type.get("images", [])

    return context_text, images_b64

def call_final_answer_endpoint(context: str, question: str, images_b64: list[str]) -> dict:
    """
    Call the /final_answer endpoint with context, question, and images.
    Uses an extended timeout to allow for slow multimodal inference.
    """
    try:
        # Make the prompt instruction explicit so the model returns concise output
        data = {
            "context": context,
            "question": f"Answer concisely and without hashtags or emojis.\n\nQuestion: {question}"
        }

        files = []
        if images_b64:
            for i, img_b64 in enumerate(images_b64):
                try:
                    img_bytes = base64.b64decode(img_b64)
                    files.append(("images", (f"image_{i}.jpg", img_bytes, "image/jpeg")))
                except Exception as e:
                    print(f"Error decoding image {i}: {e}")

        if files:
            response = requests.post(FINAL_ANSWER_URL, data=data, files=files, timeout=EXTENDED_TIMEOUT)
        else:
            response = requests.post(FINAL_ANSWER_URL, data=data, timeout=EXTENDED_TIMEOUT)

        if response.status_code == 200:
            return response.json()
        else:
            return {"error": f"API returned status {response.status_code}", "details": response.text}

    except Exception as e:
        return {"error": f"Error calling final_answer endpoint: {str(e)}"}

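# Images travel as multipart form files under the field name "images"; a
# decode failure logs the image and skips it rather than aborting the request.
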
# ───────────────────────────────────────────────
# FastAPI endpoints
# ───────────────────────────────────────────────
@app.get("/")
def home():
    return {
        "message": "Multimodal RAG Query Service is running",
        "timeout_seconds": EXTENDED_TIMEOUT,
        "endpoints": {
            "query": "/query?question=Your+Question",
            "query_with_details": "/query_with_details?question=Your+Question",
            "stats": "/stats",
        },
    }


@app.get("/stats")
def get_stats():
    try:
        vector_count = vectorstore.index.ntotal if hasattr(vectorstore, "index") else 0
        docstore_files = len([f for f in os.listdir(DOCSTORE_PATH) if f.endswith(".json")]) if os.path.exists(DOCSTORE_PATH) else 0
        return {"status": "ready", "vectorstore_count": vector_count, "docstore_count": docstore_files}
    except Exception as e:
        return {"status": "error", "error": str(e)}

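# Note: FastAPI binds bare scalar parameters like question and k to the query
# string even on POST routes, so calls look like POST /query?question=...&k=5.
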
@app.post("/query")
async def query_rag(question: str, k: int = 5):
    """
    Query the Multimodal RAG system:
    1. Search vectorstore for relevant summaries
    2. Retrieve original documents (text + tables + images)
    3. Parse into texts, tables, and images
    4. Call final_answer endpoint with all content
    5. Return cleaned answer
    """
    try:
        print(f"\nQuery: {question}")

        docs = retrieve_documents(question, k=k)
        if not docs:
            return {"question": question, "answer": "No relevant documents found. Please ingest documents first.", "retrieved_docs": 0}

        docs_by_type = parse_docs(docs)
        print(f"Parsed: {len(docs_by_type['texts'])} texts, {len(docs_by_type['tables'])} tables, {len(docs_by_type['images'])} images")

        context_text, images_b64 = build_context_and_images(docs_by_type)
        print("Calling final_answer endpoint...")
        result = call_final_answer_endpoint(context_text, question, images_b64)

        if "error" in result:
            return {
                "question": question,
                "error": result["error"],
                "details": result.get("details"),
                "retrieved_docs": len(docs),
                "context_preview": context_text[:300] if context_text else "No context"
            }

        cleaned_answer = clean_response_text(result.get("response", "No response generated"))

        return {
            "question": question,
            "answer": cleaned_answer,
            "retrieved_docs": len(docs),
            "docs_info": {
                "texts": len(docs_by_type["texts"]),
                "tables": len(docs_by_type["tables"]),
                "images": len(docs_by_type["images"]),
            },
            "context_preview": context_text[:300] if context_text else "No context",
        }

    except Exception as e:
        import traceback
        return {"question": question, "error": str(e), "traceback": traceback.format_exc()}

@app.post("/query_with_details")
async def query_with_details(question: str, k: int = 5):
    """Query with detailed document information."""
    try:
        print(f"\nDetailed Query: {question}")

        docs = retrieve_documents(question, k=k)
        if not docs:
            return {"question": question, "answer": "No relevant documents found.", "retrieved_docs": []}

        docs_by_type = parse_docs(docs)
        context_text, images_b64 = build_context_and_images(docs_by_type)

        result = call_final_answer_endpoint(context_text, question, images_b64)

        if "error" in result:
            return {"question": question, "error": result["error"], "details": result.get("details")}

        docs_info = []
        for doc in docs:
            doc_info = {
                "doc_id": doc.metadata.get("doc_id"),
                "type": doc.metadata.get("type"),
                "source": doc.metadata.get("source"),
                "summary": doc.metadata.get("summary", "")[:200],
            }
            doc_info["content"] = "[Base64 Image Data]" if doc.metadata.get("type") == "image" else doc.page_content[:300]
            docs_info.append(doc_info)

        cleaned_answer = clean_response_text(result.get("response", "No response generated"))

        return {
            "question": question,
            "answer": cleaned_answer,
            "retrieved_docs": docs_info,
            "stats": {
                "total_retrieved": len(docs),
                "texts": len(docs_by_type["texts"]),
                "tables": len(docs_by_type["tables"]),
                "images": len(docs_by_type["images"]),
            },
        }

    except Exception as e:
        import traceback
        return {"error": str(e), "traceback": traceback.format_exc()}
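A minimal smoke test for the updated service, as a sketch: the host and port are assumptions (7860 is the usual default for a local Spaces run) and the example question is hypothetical.

import requests

BASE = "http://localhost:7860"  # assumed host/port for a local run

# /stats reports vectorstore and docstore sizes
print(requests.get(f"{BASE}/stats").json())

# /query takes question and k as query parameters on a POST request;
# the JSON response carries the cleaned answer, or an "error" key on failure
r = requests.post(f"{BASE}/query", params={"question": "What does the report say about revenue?", "k": 5})
print(r.json().get("answer", r.json().get("error")))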