Spaces:

Subhajit01
/

NaviQRAG

Sleeping

App Files Files Community

Subhajit Chakraborty committed on Aug 24, 2025

Commit

3daa0bb

1 Parent(s): 3d8d387

1

Browse files

Files changed (3) hide show

.gitignore +1 -0
app.py +165 -0
requirements.txt +9 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

app.py ADDED Viewed

	@@ -0,0 +1,165 @@

+# from annotated_types import doc
+from fastapi import FastAPI, UploadFile, File, Query, Form
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo import ASCENDING
+from pymongo.operations import SearchIndexModel
+from fastapi.middleware.cors import CORSMiddleware
+import fitz
+from dotenv import load_dotenv
+# import numpy as np
+# import pytesseract
+# from pdf2image import convert_from_bytes
+import img2pdf
+# from PIL import Image
+from google import genai
+# import time
+import io
+import os
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.docstore.document import Document
+# OCR predictor, built once at import time and reused by /upload for scanned PDFs and images.
+ocr_model = ocr_predictor(pretrained=True)
+app = FastAPI()
+# CORS: requests from every origin are allowed.
+origins = ["*"]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+)
+# Must run before the os.getenv() calls below so values from a local .env file are visible.
+load_dotenv()
+client = MongoClient(os.getenv("MONGO_URI"))
+db = client["NaviQ"]
+# Single collection holding one record per text chunk (organization_id, text, embedding).
+collection: Collection = db["rag_db"]
+# Ascending index on organization_id, the field the vector search filters on.
+collection.create_index([("organization_id", ASCENDING)])
+# Gemini client, used for both embeddings (get_embedding) and answer generation (/query).
+api_key = os.getenv("GEMINI_API_KEY")
+genai_client = genai.Client(api_key=api_key)
+def get_embedding(data):
+    """Generates vector embeddings for the given data."""
+    result = genai_client.models.embed_content( model="gemini-embedding-001", contents=data)
+    return result.embeddings[0].values
+def getChunks(text, chunk_size, overlap):
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
+    return text_splitter.split_documents(text)
+def get_query_results(org_id, query):
+  """Gets results from a vector search query."""
+  query_embedding = get_embedding(query)
+  pipeline = [
+      {
+            "$vectorSearch": {
+              "index": "vector_index",
+              "queryVector": query_embedding,
+              "path": "embedding",
+              "exact": True,
+              "limit": 5,
+              "filter": {
+                  "organization_id": org_id
+              }
+            }
+      }, {
+            "$project": {
+              "_id": 0,
+              "text": 1
+         }
+      }
+  ]
+  results = collection.aggregate(pipeline=pipeline)
+  array_of_results = []
+  for doc in results:
+      array_of_results.append(doc)
+  return array_of_results
+def extract_text_from_doctr(result):
+    json_export = result.export()
+    text = ""
+    for page in json_export["pages"]:
+        for block in page["blocks"]:
+            for line in block["lines"]:
+                text += " ".join([w["value"] for w in line["words"]]) + "\n"
+    return text
+@app.post("/upload")
+async def upload_file(organization_id: str = Form(...), file: UploadFile = File(...)):
+    contents = await file.read()
+    doc = fitz.open(stream=contents, filetype="pdf")
+    print(doc)
+    text = ""
+    # Here the case 1
+    if file.filename.lower().endswith(".pdf"):
+        for page in doc:
+            text += page.get_text()
+        if text.strip() == "":
+            # Here I will use OCR
+            ocr_doc = DocumentFile.from_pdf(io.BytesIO(contents))
+            result = ocr_model(ocr_doc)
+            text = extract_text_from_doctr(result)
+    else:
+        pdf_bytes = img2pdf.convert(contents)
+        ocr_doc = DocumentFile.from_pdf(io.BytesIO(pdf_bytes))
+        result = ocr_model(ocr_doc)
+        text = extract_text_from_doctr(result)
+    print(text)
+    # return text
+    doc_obj = [Document(page_content=text)]
+    documents = getChunks(doc_obj, 400, 20)
+    # print(documents)
+    docs_to_insert = [{
+        "organization_id": organization_id,  # app-write id
+        "text": d.page_content,
+        "embedding": get_embedding(d.page_content)
+    } for d in documents]
+    collection.insert_many(docs_to_insert)
+    index_name="vector_index"
+    search_index_model = SearchIndexModel(
+    definition = {
+        "fields": [
+            {
+                "type": "vector",
+                "numDimensions": 3072,
+                "path": "embedding",
+                "similarity": "cosine"
+            },
+            {
+                "type": "filter",
+                "path": "organization_id"
+            }
+        ]
+    },
+    name = index_name,
+    type = "vectorSearch"
+    )
+    # collection.create_search_index(model=search_index_model)
+    try:
+        collection.create_search_index(model=search_index_model)
+    except Exception:
+        pass
+    return {"message": "File uploaded successfully"}
+@app.get("/query")
+async def query(organization_id: str = Query(...), question: str = Query(...)):
+    context_docs = get_query_results(organization_id, question)
+    context_string = " ".join([doc["text"] for doc in context_docs])
+    prompt = f"""Use the following pieces of context to answer the question at the end.
+    {context_string}
+    Question: {question}
+    """
+    response = genai_client.models.generate_content(model='gemini-2.5-flash', contents=prompt)
+    return response.text

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi
+pymongo
+python-dotenv
+python-doctr
+langchain
+pymupdf
+img2pdf
+google-genai
+uvicorn
+# Required by FastAPI to parse the multipart Form/File parameters of /upload.
+python-multipart