Subhajit Chakraborty committed on
Commit
3daa0bb
·
1 Parent(s): 3d8d387
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +165 -0
  3. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from annotated_types import doc
2
+ from fastapi import FastAPI, UploadFile, File, Query, Form
3
+ from pymongo import MongoClient
4
+ from pymongo.collection import Collection
5
+ from pymongo import ASCENDING
6
+ from pymongo.operations import SearchIndexModel
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ import fitz
9
+ from dotenv import load_dotenv
10
+ # import numpy as np
11
+ # import pytesseract
12
+ # from pdf2image import convert_from_bytes
13
+ import img2pdf
14
+ # from PIL import Image
15
+ from google import genai
16
+ # import time
17
+ import io
18
+ import os
19
+ from doctr.io import DocumentFile
20
+ from doctr.models import ocr_predictor
21
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
22
+ from langchain.docstore.document import Document
23
+
24
# Build the pretrained docTR OCR predictor once, at import time; the upload
# endpoint reuses this single instance.
ocr_model = ocr_predictor(pretrained=True)

# FastAPI application with CORS opened to every origin.
app = FastAPI()
origins = ["*"]
app.add_middleware(CORSMiddleware, allow_origins=origins)

# Load variables from a .env file before any of the os.getenv() reads below.
load_dotenv()

# MongoDB handles: database "NaviQ", collection "rag_db", with an ascending
# index on the organization id.
client = MongoClient(os.getenv("MONGO_URI"))
db = client["NaviQ"]
collection: Collection = db["rag_db"]
collection.create_index([("organization_id", ASCENDING)])

# Gemini client; this file uses it for embeddings and for answer generation.
api_key = os.getenv("GEMINI_API_KEY")
genai_client = genai.Client(api_key=api_key)
42
+
43
def get_embedding(data):
    """Return the embedding vector for ``data``.

    Sends ``data`` to the ``gemini-embedding-001`` model through the
    module-level ``genai_client`` and returns the ``values`` of the first
    embedding in the response.
    """
    response = genai_client.models.embed_content(
        model="gemini-embedding-001",
        contents=data,
    )
    first_embedding = response.embeddings[0]
    return first_embedding.values
47
+
48
def getChunks(text, chunk_size, overlap):
    """Split documents into overlapping chunks.

    Despite the parameter name, ``text`` is handed to ``split_documents``;
    the caller in this file passes a list containing one ``Document``.

    Args:
        text: Documents to split.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_documents(text)
51
+
52
def get_query_results(org_id, query):
    """Run a vector search for ``query`` restricted to one organization.

    The query is embedded with ``get_embedding`` and searched against the
    ``embedding`` path of the ``vector_index`` index, filtered on
    ``organization_id == org_id`` and limited to 5 results.

    Returns:
        list: The aggregation results, each projected down to its ``text``
        field (``_id`` is excluded).
    """
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",
            "queryVector": get_embedding(query),
            "path": "embedding",
            "exact": True,
            "limit": 5,
            "filter": {"organization_id": org_id},
        }
    }
    projection_stage = {"$project": {"_id": 0, "text": 1}}

    cursor = collection.aggregate(
        pipeline=[vector_search_stage, projection_stage]
    )
    return list(cursor)
80
+
81
def extract_text_from_doctr(result):
    """Flatten an OCR result into plain text, one OCR line per text line.

    Args:
        result: Object whose ``export()`` returns a dict nested as
            ``"pages" -> "blocks" -> "lines" -> "words"``, where every word
            is a dict carrying a ``"value"`` entry.

    Returns:
        str: For each line, its word values joined by single spaces and
        followed by a newline; the empty string when there are no lines.
    """
    exported = result.export()
    line_texts = []
    for page in exported["pages"]:
        for block in page["blocks"]:
            for line in block["lines"]:
                line_texts.append(
                    " ".join(word["value"] for word in line["words"])
                )
    # Join once at the end instead of growing a string with += per line.
    return "".join(f"{line_text}\n" for line_text in line_texts)
89
+
90
+
91
def _extract_upload_text(filename, contents):
    """Return the text of an uploaded file, using OCR when no text layer exists."""
    if filename.endswith(".pdf"):
        # Parse as PDF only in this branch, and close the document when done.
        # Opening every upload as a PDF up front could fail for image uploads
        # before the image path below was ever reached.
        with fitz.open(stream=contents, filetype="pdf") as pdf:
            text = "".join(page.get_text() for page in pdf)
        if text.strip():
            return text
        # No embedded text (e.g. a scanned PDF): OCR the PDF itself.
        pdf_bytes = contents
    else:
        # Anything that is not named *.pdf is treated as an image and wrapped
        # in a PDF so it can go through the same OCR path.
        pdf_bytes = img2pdf.convert(contents)

    ocr_doc = DocumentFile.from_pdf(io.BytesIO(pdf_bytes))
    return extract_text_from_doctr(ocr_model(ocr_doc))


@app.post("/upload")
async def upload_file(organization_id: str = Form(...), file: UploadFile = File(...)):
    """Ingest an uploaded PDF or image into the vector collection.

    The file's text is extracted (directly from the PDF, or via OCR), split
    into chunks of 400 characters with an overlap of 20, embedded chunk by
    chunk with ``get_embedding``, and inserted into ``collection`` tagged
    with ``organization_id``. Afterwards the ``vector_index`` search index is
    created on a best-effort basis.

    Args:
        organization_id: Form field stored on every inserted chunk and used
            as the filter field of the search index.
        file: The uploaded file; names ending in ``.pdf`` (case-insensitive)
            are parsed as PDF, everything else is treated as an image.

    Returns:
        dict: ``{"message": "File uploaded successfully"}``.

    Raises:
        HTTPException: 422 when no text could be extracted, instead of
            failing later on an empty ``insert_many``.
    """
    # Function-scope imports: these names are not among the module's imports.
    import logging

    from fastapi import HTTPException
    from pymongo.errors import PyMongoError

    # NOTE(review): the OCR, embedding and MongoDB calls below are synchronous
    # and are not awaited, so they run on the event loop of this async handler.
    contents = await file.read()
    # The filename can be missing; treat that like a non-PDF upload rather
    # than failing on None.lower().
    filename = (file.filename or "").lower()

    text = _extract_upload_text(filename, contents)
    documents = (
        getChunks([Document(page_content=text)], 400, 20) if text.strip() else []
    )
    if not documents:
        raise HTTPException(
            status_code=422,
            detail="No text could be extracted from the uploaded file",
        )

    docs_to_insert = [
        {
            "organization_id": organization_id,  # app-write id
            "text": chunk.page_content,
            "embedding": get_embedding(chunk.page_content),
        }
        for chunk in documents
    ]
    collection.insert_many(docs_to_insert)

    index_name = "vector_index"
    search_index_model = SearchIndexModel(
        definition={
            "fields": [
                {
                    "type": "vector",
                    "numDimensions": 3072,
                    "path": "embedding",
                    "similarity": "cosine",
                },
                {
                    "type": "filter",
                    "path": "organization_id",
                },
            ]
        },
        name=index_name,
        type="vectorSearch",
    )

    try:
        collection.create_search_index(model=search_index_model)
    except PyMongoError as exc:
        # Best effort: this runs on every upload, so it is expected to fail
        # once the index already exists. Record the reason instead of hiding it.
        logging.getLogger(__name__).info(
            "Search index %r was not created: %s", index_name, exc
        )
    return {"message": "File uploaded successfully"}
155
+
156
@app.get("/query")
async def query(organization_id: str = Query(...), question: str = Query(...)):
    """Answer ``question`` using context retrieved for one organization.

    Fetches matching chunks with ``get_query_results``, joins their ``text``
    fields with spaces into a context string, embeds that context and the
    question into a prompt, and returns the text of the ``gemini-2.5-flash``
    response.
    """
    matches = get_query_results(organization_id, question)
    context_string = " ".join(match["text"] for match in matches)
    prompt = f"""Use the following pieces of context to answer the question at the end.
{context_string}
Question: {question}
"""
    answer = genai_client.models.generate_content(
        model='gemini-2.5-flash',
        contents=prompt,
    )
    return answer.text
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ pymongo
3
+ python-dotenv
4
+ python-doctr
5
+ langchain
6
+ pymupdf
7
+ img2pdf
8
+ google-genai
9
+ uvicorn