Sameer-Handsome173 commited on
Commit
2d99efe
·
verified ·
1 Parent(s): 9b6b721

Upload 2 files

Browse files
Files changed (2) hide show
  1. query_service.py +373 -0
  2. split.py +330 -0
query_service.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import base64
5
+ from fastapi import FastAPI
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
8
+ from langchain_core.documents import Document
9
+
10
# Custom JSONFileStore
class JSONFileStore:
    """Minimal read-only docstore: one <key>.json file per document."""

    def __init__(self, store_path: str):
        self.store_path = store_path
        os.makedirs(self.store_path, exist_ok=True)

    def mget(self, keys: list[str]) -> list[Document]:
        """Retrieve multiple documents by their keys.

        Missing or unreadable entries come back as None so the result
        stays positionally aligned with *keys*.
        """
        results = []
        for key in keys:
            path = os.path.join(self.store_path, f"{key}.json")
            if not os.path.exists(path):
                results.append(None)
                continue
            try:
                with open(path, "r", encoding='utf-8') as fh:
                    payload = json.load(fh)
                results.append(Document(
                    page_content=payload["page_content"],
                    metadata=payload["metadata"],
                ))
            except Exception as e:
                print(f"Error loading {key}: {e}")
                results.append(None)
        return results
35
+
36
# FastAPI app serving the query side of the multimodal RAG pipeline.
app = FastAPI(title="🔍 Multimodal RAG Query Service")

# Paths
VECTOR_PATH = "./vectorstore/faiss_index"  # FAISS index written by the ingestion service
DOCSTORE_PATH = "./docstore"  # directory of <doc_id>.json original documents

# Final Answer API endpoint
# Remote LLM service that turns context + question (+ images) into an answer.
FINAL_ANSWER_URL = "https://sameer-handsome173-multi-modal.hf.space/final_answer"

# Initialize embedding function
# NOTE(review): presumably must match the model used at ingestion time,
# otherwise similarity search over the index is meaningless — confirm.
print("🔄 Loading embedding model...")
try:
    embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    print("✅ Embedding model loaded")
except Exception as e:
    print(f"❌ Error loading embeddings: {e}")
    raise  # fail fast: the service is useless without embeddings

# Load FAISS vectorstore
try:
    if os.path.exists(VECTOR_PATH):
        # allow_dangerous_deserialization: the index file is trusted here
        # because it is produced locally by the ingestion service.
        vectorstore = FAISS.load_local(
            VECTOR_PATH,
            embedding_fn,
            allow_dangerous_deserialization=True
        )
        print("✅ Loaded FAISS vectorstore")
    else:
        print("❌ Vectorstore not found! Please ingest documents first.")
        raise FileNotFoundError("Vectorstore not found")
except Exception as e:
    print(f"❌ Error loading vectorstore: {e}")
    raise

# Load JSONFileStore
try:
    if not os.path.exists(DOCSTORE_PATH):
        print("❌ Docstore not found! Please ingest documents first.")
        raise FileNotFoundError("Docstore not found")
    store = JSONFileStore(DOCSTORE_PATH)
    print("✅ Loaded JSONFileStore")
except Exception as e:
    print(f"❌ Error loading docstore: {e}")
    raise
80
+
81
def parse_docs(docs: "list[Document]") -> dict:
    """Split retrieved documents into texts, tables, and images.

    Args:
        docs: Documents whose ``metadata["type"]`` is "text", "table" or
            "image"; image documents additionally carry ``is_base64=True``.

    Returns:
        Dict with keys "images" (base64 strings), "texts" and "tables"
        (raw page_content strings).
    """
    images: list[str] = []
    texts: list[str] = []
    tables: list[str] = []

    for doc in docs:
        doc_type = doc.metadata.get("type", "text")

        if doc_type == "image" and doc.metadata.get("is_base64"):
            try:
                # Validate it's base64. validate=True rejects non-alphabet
                # characters; without it b64decode silently discards them,
                # so arbitrary text could be misclassified as an image.
                base64.b64decode(doc.page_content, validate=True)
                images.append(doc.page_content)
            except Exception:
                # If decoding fails, treat as text
                texts.append(doc.page_content)
        elif doc_type == "table":
            tables.append(doc.page_content)
        else:
            # Regular text
            texts.append(doc.page_content)

    return {
        "images": images,
        "texts": texts,
        "tables": tables
    }
111
+
112
def retrieve_documents(query: str, k: int = 3) -> list[Document]:
    """
    Retrieve documents:
    1. Search vectorstore for similar summaries
    2. Get unique doc_ids from results
    3. Retrieve original documents from docstore
    """
    try:
        hits = vectorstore.similarity_search(query, k=k)
        if not hits:
            print("⚠️ No similar documents found")
            return []

        # De-duplicate doc_ids while preserving first-seen order.
        seen = set()
        doc_ids = []
        for hit in hits:
            identifier = hit.metadata.get("doc_id")
            if identifier and identifier not in seen:
                seen.add(identifier)
                doc_ids.append(identifier)

        if not doc_ids:
            print("⚠️ No doc_ids found in metadata")
            return []

        print(f"🔑 Found {len(doc_ids)} unique doc_ids")

        # Swap summaries for the full original documents, dropping misses.
        fetched = store.mget(doc_ids)
        originals = [item for item in fetched if item is not None]
        print(f"📄 Retrieved {len(originals)} unique documents")

        return originals

    except Exception as e:
        print(f"❌ Error in retrieval: {e}")
        return []
148
+
149
def build_context_and_images(docs_by_type: dict) -> tuple[str, list[str]]:
    """
    Build context text from texts and tables, and collect image base64 strings.
    Returns: (context_text, list_of_base64_images)
    """
    # Text documents first, then tables, each under a numbered header.
    sections = [
        f"--- Text Document {idx} ---\n{body}"
        for idx, body in enumerate(docs_by_type["texts"], 1)
    ]
    sections.extend(
        f"--- Table {idx} ---\n{body}"
        for idx, body in enumerate(docs_by_type["tables"], 1)
    )
    context = "\n\n".join(sections).strip()
    return context, docs_by_type["images"]
172
+
173
def call_final_answer_endpoint(context: str, question: str, images_b64: list[str]) -> dict:
    """
    Call the /final_answer endpoint with context, question, and images.

    Args:
        context: Concatenated text/table context for the LLM.
        question: The user's question.
        images_b64: Base64-encoded JPEG images to attach (may be empty).

    Returns:
        The endpoint's JSON payload on success, otherwise a dict with an
        "error" key (and "details" when an HTTP status was received).
    """
    try:
        # Prepare form data
        data = {
            "context": context,
            "question": question
        }

        # Decode each base64 image into bytes for multipart upload;
        # undecodable entries are skipped with a warning.
        files = []
        for i, img_b64 in enumerate(images_b64):
            try:
                img_bytes = base64.b64decode(img_b64)
                files.append(("images", (f"image_{i}.jpg", img_bytes, "image/jpeg")))
            except Exception as e:
                print(f"⚠️ Error processing image {i}: {str(e)}")

        # requests treats files=None the same as omitting the argument, so
        # one call replaces the duplicated with-files / without-files branches.
        response = requests.post(
            FINAL_ANSWER_URL,
            data=data,
            files=files or None,
            timeout=150
        )

        if response.status_code == 200:
            return response.json()
        else:
            return {
                "error": f"API returned status {response.status_code}",
                "details": response.text
            }

    except Exception as e:
        return {
            "error": f"Error calling final_answer endpoint: {str(e)}"
        }
223
+
224
@app.get("/")
def home():
    """Service banner: confirms the query service is up and lists its routes."""
    routes = {
        "query": "POST /query?question=YOUR_QUESTION&k=5",
        "query_with_details": "POST /query_with_details?question=YOUR_QUESTION&k=5",
        "stats": "GET /stats",
    }
    capabilities = ["Text retrieval", "Table retrieval", "Image retrieval", "Multimodal querying"]
    return {
        "message": "✅ Multimodal RAG Query Service is running",
        "endpoints": routes,
        "features": capabilities,
    }
235
+
236
@app.get("/stats")
def get_stats():
    """Get system statistics"""
    try:
        vector_count = 0
        if hasattr(vectorstore, 'index'):
            vector_count = vectorstore.index.ntotal

        # Each stored document is one .json file in the docstore directory.
        docstore_files = 0
        if os.path.exists(DOCSTORE_PATH):
            docstore_files = sum(
                1 for name in os.listdir(DOCSTORE_PATH) if name.endswith('.json')
            )

        return {
            "status": "ready",
            "vectorstore_count": vector_count,
            "docstore_count": docstore_files
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
250
+
251
@app.post("/query")
async def query_rag(question: str, k: int = 5):
    """
    Query the Multimodal RAG system:
    1. Search vectorstore for relevant summaries
    2. Retrieve original documents (text + tables + images)
    3. Parse into texts, tables, and images
    4. Call final_answer endpoint with all content
    5. Return answer
    """
    try:
        print(f"\n🔍 Query: {question}")

        retrieved = retrieve_documents(question, k=k)

        # Nothing indexed (or nothing similar) — short-circuit.
        if not retrieved:
            return {
                "question": question,
                "answer": "No relevant documents found. Please ingest documents first.",
                "retrieved_docs": 0
            }

        grouped = parse_docs(retrieved)
        print(f"📊 Parsed: {len(grouped['texts'])} texts, {len(grouped['tables'])} tables, {len(grouped['images'])} images")

        context_text, images_b64 = build_context_and_images(grouped)

        print("🚀 Calling final_answer endpoint...")
        result = call_final_answer_endpoint(context_text, question, images_b64)

        preview = context_text[:300] if context_text else "No context"

        # Propagate downstream errors with enough context to debug.
        if "error" in result:
            return {
                "question": question,
                "error": result["error"],
                "details": result.get("details"),
                "retrieved_docs": len(retrieved),
                "context_preview": preview
            }

        return {
            "question": question,
            "answer": result.get("response", "No response generated"),
            "retrieved_docs": len(retrieved),
            "docs_info": {
                "texts": len(grouped['texts']),
                "tables": len(grouped['tables']),
                "images": len(grouped['images'])
            },
            "context_preview": preview
        }

    except Exception as e:
        import traceback
        return {
            "question": question,
            "error": str(e),
            "traceback": traceback.format_exc()
        }
314
+
315
@app.post("/query_with_details")
async def query_with_details(question: str, k: int = 5):
    """Query with detailed document information"""
    try:
        print(f"\n🔍 Detailed Query: {question}")

        retrieved = retrieve_documents(question, k=k)
        if not retrieved:
            return {
                "question": question,
                "answer": "No relevant documents found.",
                "retrieved_docs": []
            }

        grouped = parse_docs(retrieved)
        context_text, images_b64 = build_context_and_images(grouped)
        result = call_final_answer_endpoint(context_text, question, images_b64)

        # Per-document summaries; full base64 image payloads are elided
        # so the response stays small.
        docs_info = []
        for doc in retrieved:
            is_image = doc.metadata.get("type") == "image"
            docs_info.append({
                "doc_id": doc.metadata.get("doc_id"),
                "type": doc.metadata.get("type"),
                "source": doc.metadata.get("source"),
                "summary": doc.metadata.get("summary", "")[:200],
                "content": "[Base64 Image Data]" if is_image else doc.page_content[:300],
            })

        return {
            "question": question,
            "answer": result.get("response", result.get("error", "No response")),
            "retrieved_docs": docs_info,
            "stats": {
                "total_retrieved": len(retrieved),
                "texts": len(grouped['texts']),
                "tables": len(grouped['tables']),
                "images": len(grouped['images'])
            }
        }

    except Exception as e:
        import traceback
        return {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
split.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import uuid
4
+ import requests
5
+ import base64
6
+ import fitz # PyMuPDF
7
+ from fastapi import FastAPI, UploadFile, File
8
+ from pypdf import PdfReader
9
+ import pdfplumber
10
+ from PIL import Image
11
+ import io
12
+ from langchain_community.vectorstores import FAISS
13
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
14
+ from langchain_core.documents import Document
15
+
16
+
17
# ================= JSON File Store =================
class JSONFileStore:
    """Persists Documents as one JSON file per key under store_path."""

    def __init__(self, store_path: str):
        self.store_path = store_path
        os.makedirs(self.store_path, exist_ok=True)

    def _path_for(self, key: str) -> str:
        # One <key>.json file per stored document.
        return os.path.join(self.store_path, f"{key}.json")

    def mset(self, key_value_pairs: list[tuple[str, Document]]) -> None:
        """Write each (key, Document) pair to its own JSON file."""
        for key, doc in key_value_pairs:
            serialized = {"page_content": doc.page_content, "metadata": doc.metadata}
            with open(self._path_for(key), "w", encoding="utf-8") as fh:
                json.dump(serialized, fh, ensure_ascii=False)

    def mget(self, keys: list[str]) -> list[Document]:
        """Read documents for *keys*; missing/unreadable entries become None."""
        results = []
        for key in keys:
            path = self._path_for(key)
            if not os.path.exists(path):
                results.append(None)
                continue
            try:
                with open(path, "r", encoding="utf-8") as fh:
                    payload = json.load(fh)
                results.append(
                    Document(
                        page_content=payload["page_content"],
                        metadata=payload["metadata"],
                    )
                )
            except Exception as e:
                print(f"Error loading {key}: {e}")
                results.append(None)
        return results
50
+
51
+
52
# ================= FastAPI Setup =================
app = FastAPI(title="🚀 Multimodal RAG Ingestion Service (Text + Tables + Images)")

VECTOR_PATH = "./vectorstore/faiss_index"  # FAISS index shared with the query service
DOCSTORE_PATH = "./docstore"  # one <doc_id>.json per original document
TEMP_DOCS_PATH = "./docs"  # scratch directory for uploaded PDFs

# Remote summarization endpoints: text goes to the Qwen route, images to
# the "smol" captioning route.
QWEN_TEXT_URL = "https://sameer-handsome173-multi-modal.hf.space/summarize_qwen"
BLIP_IMAGE_URL = "https://sameer-handsome173-multi-modal.hf.space/summarize_smol"

# NOTE(review): embedding model must match the one the query service loads,
# or its similarity searches over this index are meaningless — confirm.
print("🔄 Loading embedding model...")
embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
print("✅ Embedding model loaded")

# Load or create vectorstore
if os.path.exists(VECTOR_PATH):
    # allow_dangerous_deserialization: the index is trusted because this
    # service wrote it itself.
    vectorstore = FAISS.load_local(
        VECTOR_PATH, embedding_fn, allow_dangerous_deserialization=True
    )
    print("✅ Loaded existing FAISS vectorstore")
else:
    os.makedirs(os.path.dirname(VECTOR_PATH), exist_ok=True)
    # NOTE(review): the index is seeded with a dummy "init" text; that
    # entry can surface in similarity searches — confirm it is filtered.
    vectorstore = FAISS.from_texts(["init"], embedding_fn)
    print("✅ Created new FAISS vectorstore")

# Initialize JSON store
os.makedirs(DOCSTORE_PATH, exist_ok=True)
store = JSONFileStore(DOCSTORE_PATH)
print("✅ Initialized JSONFileStore")
81
+
82
+
83
# ================= Extraction Functions =================
def extract_tables_from_pdf(pdf_path: str) -> list[str]:
    """Extract every table in the PDF as a pipe-delimited text block."""
    tables = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                for table in page.extract_tables() or []:
                    # Header line, then one " | "-joined line per row.
                    lines = [f"Table from page {page_num + 1}:"]
                    for row in table:
                        if row:
                            lines.append(
                                " | ".join(str(cell) if cell else "" for cell in row)
                            )
                    tables.append("\n".join(lines) + "\n")
                    print(f"📊 Extracted table from page {page_num + 1}")
    except Exception as e:
        print(f"⚠️ Error extracting tables: {e}")
    return tables
103
+
104
+
105
def extract_text_from_pdf(pdf_path: str) -> list[dict]:
    """Extract text per page"""
    pages = []
    try:
        for index, page in enumerate(PdfReader(pdf_path).pages):
            raw = page.extract_text()
            # Skip pages with no extractable text.
            if raw and raw.strip():
                pages.append({"page": index + 1, "content": raw.strip()})
                print(f"📝 Extracted text from page {index+1}")
    except Exception as e:
        print(f"❌ Error extracting text: {e}")
    return pages
118
+
119
+
120
import hashlib

def extract_images_from_pdf(pdf_path: str) -> list[dict]:
    """Extract large, unique images from a PDF.

    Returns:
        A list of ``{"page": int, "image_b64": str}`` dicts. This is the
        shape the /ingest endpoint consumes (it indexes each item with
        ``item["page"]`` / ``item["image_b64"]``); the previous bare-string
        return made ingestion crash with a TypeError on string indexing.

    Images smaller than 100x100 px are skipped, as are byte-identical
    duplicates (matched by MD5 of the raw stream data).
    """
    extracted: list[dict] = []
    seen_hashes: set[str] = set()
    try:
        reader = PdfReader(pdf_path)
        for page_num, page in enumerate(reader.pages):
            if '/XObject' not in page['/Resources']:
                continue
            xObject = page['/Resources']['/XObject'].get_object()
            for obj in xObject:
                if xObject[obj]['/Subtype'] != '/Image':
                    continue
                try:
                    width = xObject[obj]['/Width']
                    height = xObject[obj]['/Height']
                    if width < 100 or height < 100:
                        continue  # skip small images (icons, bullets, ...)

                    data = xObject[obj].get_data()
                    digest = hashlib.md5(data).hexdigest()
                    if digest in seen_hashes:
                        continue  # skip duplicates
                    seen_hashes.add(digest)

                    # NOTE(review): assumes raw pixel data in /DeviceRGB or a
                    # palette; DCT/JPX-encoded streams or array color spaces
                    # will raise here and be skipped by the except below.
                    mode = "RGB" if xObject[obj]['/ColorSpace'] == '/DeviceRGB' else "P"
                    image = Image.frombytes(mode, (width, height), data)
                    buffered = io.BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
                    extracted.append({"page": page_num + 1, "image_b64": img_b64})

                    print(f"📸 Extracted image from page {page_num+1} ({width}x{height})")
                except Exception as e:
                    print(f"⚠️ Error extracting image from page {page_num+1}: {e}")
    except Exception as e:
        print(f"❌ Error extracting images: {e}")
    return extracted
159
+
160
+
161
+
162
# ================= Summarization =================
def summarize_text(content: str) -> str:
    """Summarize *content* via the remote Qwen endpoint.

    Falls back to the first 200 characters on any failure (network error,
    non-200 status, or a response missing the "response" field).
    """
    fallback = content[:200]
    try:
        reply = requests.post(
            QWEN_TEXT_URL,
            data={"prompt": f"Summarize the following content:\n\n{content}"},
            timeout=30,
        )
        if reply.status_code == 200:
            return reply.json().get("response", fallback)
        return fallback
    except Exception as e:
        print(f"⚠️ Text summary fallback: {e}")
        return fallback
177
+
178
+
179
def summarize_image(image_b64: str) -> str:
    """Describe a base64-encoded image via the remote captioning endpoint.

    Returns a generic placeholder string on any failure.
    """
    try:
        payload = base64.b64decode(image_b64)
        response = requests.post(
            BLIP_IMAGE_URL,
            files={"image": ("image.jpg", payload, "image/jpeg")},
            data={"text": "Describe this image in detail"},
            timeout=30,
        )
        if response.status_code == 200:
            return response.json().get("response", "No image summary generated")
        return "Image extracted from PDF"
    except Exception as e:
        print(f"⚠️ Image summary fallback: {e}")
        return "Image extracted from PDF"
191
+
192
+
193
# ================= FastAPI Endpoints =================
@app.get("/")
def home():
    """Service banner: confirms the ingestion service is up and lists routes."""
    routes = {
        "ingest": "POST /ingest - Upload PDF file",
        "stats": "GET /stats - View system statistics",
    }
    return {
        "message": "✅ Multimodal RAG Ingestion Service is running",
        "endpoints": routes,
    }
203
+
204
+
205
@app.get("/stats")
def get_stats():
    """Report vector-index and docstore entry counts."""
    vector_count = vectorstore.index.ntotal if hasattr(vectorstore, "index") else 0

    # Each stored document is one .json file in the docstore directory.
    if os.path.exists(DOCSTORE_PATH):
        docstore_files = sum(
            1 for name in os.listdir(DOCSTORE_PATH) if name.endswith(".json")
        )
    else:
        docstore_files = 0

    return {
        "status": "healthy",
        "vectorstore_count": vector_count,
        "docstore_count": docstore_files,
    }
220
+
221
+
222
@app.post("/ingest")
async def ingest_pdf(file: UploadFile = File(...)):
    """Ingest a PDF: extract text/tables/images, summarize each piece,
    index the summaries in FAISS and persist the originals in the docstore.

    Returns a success summary dict, or {"error": ...} when the upload is
    not a PDF or yields no extractable content.
    """
    # Case-insensitive extension check so "REPORT.PDF" is accepted too.
    if not file.filename.lower().endswith(".pdf"):
        return {"error": "Only PDF files are supported"}

    os.makedirs(TEMP_DOCS_PATH, exist_ok=True)
    temp_path = os.path.join(TEMP_DOCS_PATH, file.filename)

    with open(temp_path, "wb") as f:
        f.write(await file.read())

    try:
        print(f"\n📄 Processing {file.filename}...")
        texts = extract_text_from_pdf(temp_path)
        images = extract_images_from_pdf(temp_path)
        tables = extract_tables_from_pdf(temp_path)

        print(f"📊 Found: {len(texts)} texts, {len(tables)} tables, {len(images)} images")

        if not texts and not tables and not images:
            return {"error": "No content extracted", "filename": file.filename}

        doc_ids, summaries, originals = [], [], []

        def _register(content: str, summary: str, metadata: dict) -> None:
            # Shared bookkeeping: one uuid keys both the summary (vector
            # index) and the original document (docstore).
            doc_id = str(uuid.uuid4())
            doc_ids.append(doc_id)
            summaries.append(summary)
            originals.append(
                Document(page_content=content, metadata={"doc_id": doc_id, **metadata})
            )

        # Texts (one document per page)
        for item in texts:
            content = item["content"]
            summary = summarize_text(content)
            _register(content, summary, {
                "type": "text",
                "page": item["page"],
                "source": file.filename,
                "summary": summary,
            })

        # Tables
        for table in tables:
            summary = summarize_text(f"Table content:\n{table}")
            _register(table, summary, {
                "type": "table",
                "source": file.filename,
                "summary": summary,
            })

        # Images — accept both {"page", "image_b64"} dicts and bare base64
        # strings (older extractor output), instead of crashing with a
        # TypeError when indexing a string by key.
        for item in images:
            if isinstance(item, dict):
                page_num = item.get("page")
                img_b64 = item["image_b64"]
            else:
                page_num = None
                img_b64 = item
            summary = summarize_image(img_b64)
            _register(img_b64, summary, {
                "type": "image",
                "page": page_num,
                "source": file.filename,
                "summary": summary,
                "is_base64": True,
            })

        # Store: summaries into FAISS, originals into the JSON docstore.
        vectorstore.add_texts(
            texts=summaries,
            metadatas=[{"doc_id": doc_id, "source": file.filename} for doc_id in doc_ids],
            ids=doc_ids,
        )
        store.mset(list(zip(doc_ids, originals)))
        vectorstore.save_local(VECTOR_PATH)
        print("✅ Saved to disk")

        return {
            "status": "success",
            "filename": file.filename,
            "processed": {
                "texts": len(texts),
                "tables": len(tables),
                "images": len(images),
                "total": len(originals),
            },
            "doc_ids_sample": doc_ids[:5],
            "message": f"✅ Processed {len(originals)} components from {file.filename}",
        }
    finally:
        # Always clean up the temp upload — the original leaked it on the
        # "No content extracted" early return and on any exception.
        if os.path.exists(temp_path):
            os.remove(temp_path)