Spaces:

Subhajit01
/

NaviQRAG

Sleeping

App Files Files Community

NaviQRAG / app.py

Subhajit01

update app.py(1)

4f8f5c3 verified 6 months ago

raw

history blame contribute delete

5.2 kB

	# from annotated_types import doc
	from flask import Flask, request, jsonify
	from flask_cors import CORS
	from pymongo import MongoClient
	from pymongo.collection import Collection
	from pymongo import ASCENDING
	from pymongo.operations import SearchIndexModel
	from fastapi.middleware.cors import CORSMiddleware
	import fitz
	from dotenv import load_dotenv
	# import numpy as np
	# import pytesseract
	# from pdf2image import convert_from_bytes
	import img2pdf
	# from PIL import Image
	from google import genai
	# import time
	import io
	import os
	from doctr.io import DocumentFile
	from doctr.models import ocr_predictor
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.docstore.document import Document

	load_dotenv()

	app = Flask(__name__)
	CORS(app)
	ocr_model = ocr_predictor(pretrained=True)
	print("model downloaded")

	client = MongoClient(os.getenv("MONGO_URI"))
	db = client["NaviQ"]
	collection: Collection = db["rag_db"]
	collection.create_index([("organization_id", ASCENDING)])

	# The embedding model
	api_key = os.getenv("GEMINI_API_KEY")
	genai_client = genai.Client(api_key=api_key)

	def get_embedding(data):
	"""Generates vector embeddings for the given data."""
	result = genai_client.models.embed_content( model="gemini-embedding-001", contents=data)
	return result.embeddings[0].values

	def getChunks(text, chunk_size, overlap):
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
	return text_splitter.split_documents(text)

	def get_query_results(org_id, query):
	"""Gets results from a vector search query."""
	query_embedding = get_embedding(query)
	pipeline = [
	{
	"$vectorSearch": {
	"index": "vector_index",
	"queryVector": query_embedding,
	"path": "embedding",
	"exact": True,
	"limit": 5,
	"filter": {
	"organization_id": org_id
	}
	}
	}, {
	"$project": {
	"_id": 0,
	"text": 1
	}
	}
	]

	results = collection.aggregate(pipeline=pipeline)
	array_of_results = []
	for doc in results:
	array_of_results.append(doc)
	return array_of_results

	def extract_text_from_doctr(result):
	json_export = result.export()
	text = ""
	for page in json_export["pages"]:
	for block in page["blocks"]:
	for line in block["lines"]:
	text += " ".join([w["value"] for w in line["words"]]) + "\n"
	return text


	@app.route("/upload", methods=["POST"])
	def upload_file():
	organization_id = request.form.get("organization_id")
	file = request.files.get("file")

	if not file or not organization_id:
	return jsonify({"error": "Missing file or organization_id"}), 400

	contents = file.read()
	doc = fitz.open(stream=contents, filetype="pdf")
	print(doc)
	text = ""
	# Here the case 1
	if file.filename.lower().endswith(".pdf"):
	for page in doc:
	text += page.get_text()

	if text.strip() == "":
	# Here I will use OCR
	ocr_doc = DocumentFile.from_pdf(io.BytesIO(contents))
	result = ocr_model(ocr_doc)
	text = extract_text_from_doctr(result)

	else:
	pdf_bytes = img2pdf.convert(contents)
	ocr_doc = DocumentFile.from_pdf(io.BytesIO(pdf_bytes))
	result = ocr_model(ocr_doc)
	text = extract_text_from_doctr(result)

	print(text)
	# return text

	doc_obj = [Document(page_content=text)]
	documents = getChunks(doc_obj, 400, 20)


	# print(documents)
	docs_to_insert = [{
	"organization_id": organization_id, # app-write id
	"text": d.page_content,
	"embedding": get_embedding(d.page_content)
	} for d in documents]

	collection.insert_many(docs_to_insert)
	index_name="vector_index"
	search_index_model = SearchIndexModel(
	definition = {
	"fields": [
	{
	"type": "vector",
	"numDimensions": 3072,
	"path": "embedding",
	"similarity": "cosine"
	},
	{
	"type": "filter",
	"path": "organization_id"
	}
	]
	},
	name = index_name,
	type = "vectorSearch"
	)
	# collection.create_search_index(model=search_index_model)

	try:
	collection.create_search_index(model=search_index_model)
	except Exception:
	pass
	return {"message": "File uploaded successfully"}

	@app.route("/query", methods=["GET"])
	def query():
	organization_id = request.args.get("organization_id")
	question = request.args.get("question")

	context_docs = get_query_results(organization_id, question)
	context_string = " ".join([doc["text"] for doc in context_docs])
	prompt = f"""Use the following pieces of context to answer the question at the end.
	{context_string}
	Question: {question}
	"""
	response = genai_client.models.generate_content(model='gemini-2.5-flash', contents=prompt)
	return jsonify({"answer": response.text})

	if __name__ == "__main__":
	app.run(host="0.0.0.0", port=7860)