NaviQRAG / app.py
Subhajit01's picture
update app.py(1)
4f8f5c3 verified
# from annotated_types import doc
from flask import Flask, request, jsonify
from flask_cors import CORS
from pymongo import MongoClient
from pymongo.collection import Collection
from pymongo import ASCENDING
from pymongo.operations import SearchIndexModel
from fastapi.middleware.cors import CORSMiddleware
import fitz
from dotenv import load_dotenv
# import numpy as np
# import pytesseract
# from pdf2image import convert_from_bytes
import img2pdf
# from PIL import Image
from google import genai
# import time
import io
import os
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
load_dotenv()
app = Flask(__name__)
CORS(app)
ocr_model = ocr_predictor(pretrained=True)
print("model downloaded")
client = MongoClient(os.getenv("MONGO_URI"))
db = client["NaviQ"]
collection: Collection = db["rag_db"]
collection.create_index([("organization_id", ASCENDING)])
# The embedding model
api_key = os.getenv("GEMINI_API_KEY")
genai_client = genai.Client(api_key=api_key)
def get_embedding(data):
"""Generates vector embeddings for the given data."""
result = genai_client.models.embed_content( model="gemini-embedding-001", contents=data)
return result.embeddings[0].values
def getChunks(text, chunk_size, overlap):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
return text_splitter.split_documents(text)
def get_query_results(org_id, query):
"""Gets results from a vector search query."""
query_embedding = get_embedding(query)
pipeline = [
{
"$vectorSearch": {
"index": "vector_index",
"queryVector": query_embedding,
"path": "embedding",
"exact": True,
"limit": 5,
"filter": {
"organization_id": org_id
}
}
}, {
"$project": {
"_id": 0,
"text": 1
}
}
]
results = collection.aggregate(pipeline=pipeline)
array_of_results = []
for doc in results:
array_of_results.append(doc)
return array_of_results
def extract_text_from_doctr(result):
json_export = result.export()
text = ""
for page in json_export["pages"]:
for block in page["blocks"]:
for line in block["lines"]:
text += " ".join([w["value"] for w in line["words"]]) + "\n"
return text
@app.route("/upload", methods=["POST"])
def upload_file():
organization_id = request.form.get("organization_id")
file = request.files.get("file")
if not file or not organization_id:
return jsonify({"error": "Missing file or organization_id"}), 400
contents = file.read()
doc = fitz.open(stream=contents, filetype="pdf")
print(doc)
text = ""
# Here the case 1
if file.filename.lower().endswith(".pdf"):
for page in doc:
text += page.get_text()
if text.strip() == "":
# Here I will use OCR
ocr_doc = DocumentFile.from_pdf(io.BytesIO(contents))
result = ocr_model(ocr_doc)
text = extract_text_from_doctr(result)
else:
pdf_bytes = img2pdf.convert(contents)
ocr_doc = DocumentFile.from_pdf(io.BytesIO(pdf_bytes))
result = ocr_model(ocr_doc)
text = extract_text_from_doctr(result)
print(text)
# return text
doc_obj = [Document(page_content=text)]
documents = getChunks(doc_obj, 400, 20)
# print(documents)
docs_to_insert = [{
"organization_id": organization_id, # app-write id
"text": d.page_content,
"embedding": get_embedding(d.page_content)
} for d in documents]
collection.insert_many(docs_to_insert)
index_name="vector_index"
search_index_model = SearchIndexModel(
definition = {
"fields": [
{
"type": "vector",
"numDimensions": 3072,
"path": "embedding",
"similarity": "cosine"
},
{
"type": "filter",
"path": "organization_id"
}
]
},
name = index_name,
type = "vectorSearch"
)
# collection.create_search_index(model=search_index_model)
try:
collection.create_search_index(model=search_index_model)
except Exception:
pass
return {"message": "File uploaded successfully"}
@app.route("/query", methods=["GET"])
def query():
organization_id = request.args.get("organization_id")
question = request.args.get("question")
context_docs = get_query_results(organization_id, question)
context_string = " ".join([doc["text"] for doc in context_docs])
prompt = f"""Use the following pieces of context to answer the question at the end.
{context_string}
Question: {question}
"""
response = genai_client.models.generate_content(model='gemini-2.5-flash', contents=prompt)
return jsonify({"answer": response.text})
if __name__ == "__main__":
app.run(host="0.0.0.0", port=7860)