ashishbangwal commited on
Commit
f1d1d20
·
1 Parent(s): 3057b47
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.db
2
+ .env
3
+ __pycache__
4
+ /chromadb/*
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install dependencies first so this layer is cached until requirements change
COPY requirements.txt /app
RUN pip install --no-cache-dir -r requirements.txt

# Create a non-root user for running the service
RUN useradd -m appuser

# Copy the application source, THEN fix ownership/permissions.
# (The original chown'd /app before COPY . so the copied files stayed
# root-owned, and it never switched away from root at all.)
COPY . /app
RUN mkdir -p /app/data && \
    chown -R appuser:appuser /app && \
    chmod -R 755 /app

# Drop root privileges for the running container
USER appuser

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from fastapi import FastAPI, Form, UploadFile, File
3
+ from data_ingetion.data import AdvancedDatabase
4
+ from data_ingetion.pre_processor import read_metadata
5
+ from generator.response import generate_response
6
+
7
+ db = AdvancedDatabase()
8
+ app = FastAPI()
9
+
10
+
@app.post("/ingest_files")
async def ingest_files(
    metadata: str = Form(...),
    group_name: str = Form(...),
    files: List[UploadFile] = File(...),
):
    """Accept a batch of PDF/DOCX uploads and ingest them into the named group.

    The uploads are processed asynchronously by the database layer, which
    chunks, embeds and stores them in the collection `group_name`.
    """
    ingest_summary = await db.ingest(files, metadata, group_name)
    return ingest_summary
20
+
21
+
@app.post("/fetch_response")
async def get_response(query: str, group_name: str, include_chunks: bool = False):
    """Invoke the RAG pipeline against the given collection and return the answer.

    When `include_chunks` is true, the retrieved context chunks are returned
    alongside the LLM response.
    """
    # BUG FIX: generate_response defaults to return_chunks=True, so it always
    # returns a (llm_response, context_chunks) tuple. The original returned the
    # raw tuple as "llm_response" whenever include_chunks was False.
    llm_answer, chunks = generate_response(query, group_name)
    if include_chunks:
        return {"llm_response": llm_answer, "chunks": chunks}
    return {"llm_response": llm_answer}
29
+
30
+
@app.get("/get_metadata")
async def get_metadata():
    """Fetch metadata of previously uploaded document batches, grouped by
    collection (i.e. the time they were uploaded)."""
    return {"Metadata": read_metadata()}
data_ingetion/data.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from uuid import uuid4
3
+
4
+ from .pre_processor import full_processor
5
+
6
+ from typing import List
7
+ from fastapi import UploadFile
8
+
9
+ DB_NAME = "chromadb"
10
+
11
+
class AdvancedDatabase:
    """Thin wrapper around a persistent on-disk ChromaDB client.

    One collection per ingestion "group"; chunks are stored together with
    precomputed embeddings and per-chunk metadata.
    """

    def __init__(self) -> None:
        # Persist vectors under ./chromadb so data survives process restarts
        self.client = chromadb.PersistentClient(path=DB_NAME)

    async def ingest(self, files: List[UploadFile], user_metadata, collection_name):
        """Process uploads into chunks/embeddings and store them.

        Returns a summary dict: {"chunks_added": int, "collection_name": str}.
        """
        chunks, metadata, embeddings = await full_processor(
            files, user_metadata, collection_name
        )

        # BUG FIX: create_collection raises if the collection already exists,
        # so re-ingesting into an existing group crashed the endpoint.
        # get_or_create_collection makes ingestion idempotent per group.
        collection = self.client.get_or_create_collection(name=collection_name)

        collection.add(
            ids=[str(uuid4()) for _ in range(len(chunks))],
            embeddings=embeddings,
            documents=chunks,
            metadatas=metadata,
        )

        return {"chunks_added": len(chunks), "collection_name": collection_name}

    def get_context(self, embedding: List[float], group_name: str):
        """Return the top-5 most similar stored chunks for a query embedding.

        Raises if `group_name` has never been ingested (collection missing).
        """
        collection = self.client.get_collection(name=group_name)

        response = collection.query(
            query_embeddings=embedding, n_results=5, include=["documents"]
        )

        # query() returns one document list per query embedding; we send one
        return response["documents"][0]
data_ingetion/pre_processor.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from docx import Document
from pypdf import PdfReader
import datetime
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
import sqlite3
import json

import os
from io import BytesIO

from typing import List
from fastapi import UploadFile

# Chunking configuration: small chunks with generous overlap for retrieval
splitter = RecursiveCharacterTextSplitter(chunk_size=225, chunk_overlap=64)
EMBEDDING_MODEL = "togethercomputer/m2-bert-80M-2k-retrieval"

# SECURITY FIX: the original code shipped a hard-coded API key as a fallback.
# That key is now public (committed to the repo) and must be rotated; the key
# must come exclusively from the environment.
api_key = os.getenv("TOGETHER_API")
if not api_key:
    raise RuntimeError("TOGETHER_API environment variable is not set")
ai_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

# Setup/initiate the SQLite database used as the metadata store.
# check_same_thread=False because this module-level connection is shared by
# FastAPI handlers that may run on different threads.
conn = sqlite3.connect("metadata.db", check_same_thread=False)
cursor = conn.cursor()
cursor.execute(
    """
    CREATE TABLE IF NOT EXISTS metadata_store (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        meta_dict TEXT NOT NULL
    )
    """
)
conn.commit()
async def full_processor(files: List[UploadFile], user_tags: str, collection_name: str):
    """Extract text from uploads, chunk it, embed the chunks and log metadata.

    Returns a tuple (chunks, per-chunk metadata dicts, embedding vectors).
    """
    user_metadata = {"tags": user_tags}

    file_names = [str(file.filename) for file in files]
    file_types = [name.split(".")[-1] for name in file_names]

    processed_docs = await processor(files, file_types)

    # BUG FIX: processor() silently skips unsupported file types, so
    # processed_docs can be shorter than file_names. Zipping chunks against
    # the full name list mis-attributed chunks to the wrong files; pair them
    # with the names of the *supported* files only.
    supported_names = [
        name
        for name, ftype in zip(file_names, file_types)
        if ftype in ("pdf", "docx")
    ]

    chunks, metadata = create_chunks(processed_docs, supported_names, user_metadata)

    response = ai_client.embeddings.create(input=chunks, model=EMBEDDING_MODEL)
    embeddings = [item.embedding for item in response.data]

    write_metadata(file_names, file_types, user_tags, len(chunks), collection_name)

    return (chunks, metadata, embeddings)
53
+
54
+
async def processor(files: List[UploadFile], file_types: List[str]) -> List[str]:
    """Return extracted plain text for each supported upload, in order.

    Only "pdf" and "docx" are handled; any other extension is skipped
    silently (no placeholder is appended for it).
    """
    texts: List[str] = []
    for position, kind in enumerate(file_types):
        if kind == "pdf":
            extracted = await process_pdf(files[position])
        elif kind == "docx":
            extracted = await process_docx(files[position])
        else:
            continue  # unsupported extension: skip
        texts.append(extracted)
    return texts
64
+
65
+
async def process_pdf(file: UploadFile) -> str:
    """Read an uploaded PDF in full and return its concatenated page text."""
    raw = await file.read()
    reader = PdfReader(BytesIO(raw))

    # extract_text() can return None for pages without a text layer
    return "".join(page.extract_text() or "" for page in reader.pages)
75
+
76
+
async def process_docx(file: UploadFile) -> str:
    """Read an uploaded DOCX and return all paragraph text, one per line."""
    raw = await file.read()
    document = Document(BytesIO(raw))

    # Each paragraph is followed by a newline, matching the original output
    pieces = [paragraph.text + "\n" for paragraph in document.paragraphs]
    return "".join(pieces)
86
+
87
+
def create_chunks(docs: List[str], file_names: List[str], user_metadata):
    """Split each document into chunks and build one metadata dict per chunk.

    `docs` and `file_names` are paired by index. Returns
    (flat chunk list, parallel metadata list). All chunks from one document
    share a single metadata dict object.
    """
    all_chunks: List[str] = []
    all_metadata: List[dict] = []

    # One timestamp for the whole batch: day/month/year-hour (12h clock,
    # no AM/PM marker — matches the original format string).
    stamp = datetime.datetime.now().strftime("%d/%m/%y-%I")

    for idx, document_text in enumerate(docs):
        pieces = splitter.split_text(document_text)
        all_chunks.extend(pieces)
        shared_meta = {
            "file_name": file_names[idx],
            "timestamp": stamp,
            **user_metadata,
        }
        all_metadata.extend([shared_meta] * len(pieces))

    return all_chunks, all_metadata
105
+
106
+
def write_metadata(file_names, file_types, user_tags, total_chunks, collection_name):
    """Persist a JSON summary record for this ingestion batch into SQLite."""
    stamp = datetime.datetime.now().strftime("%d/%m/%y-%I")

    # Tally uploads by extension: supported (pdf/docx) vs everything else
    counts = {"pdf": 0, "docx": 0, "other": 0}
    for ext in file_types:
        bucket = ext if ext in ("pdf", "docx") else "other"
        counts[bucket] += 1

    record = {
        "Collection": collection_name,
        "FileName": file_names,
        "TotalPDF": counts["pdf"],
        "TotalDocx": counts["docx"],
        "Unsupported": counts["other"],
        "CustomTag": user_tags,
        "TotalChunks": total_chunks,
        "Time": stamp,
    }
    cursor.execute(
        "INSERT INTO metadata_store (meta_dict) VALUES (?)", (json.dumps(record),)
    )
    conn.commit()
133
+
134
+
def read_metadata():
    """Return every stored ingestion-batch record, decoded from JSON,
    in insertion order."""
    cursor.execute("SELECT meta_dict FROM metadata_store")
    return [json.loads(stored) for (stored,) in cursor.fetchall()]
generator/prompts.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
# System prompt sent as the first chat message of every completion request;
# instructs the model to ground its answer in the retrieved CONTEXT chunks.
SYS_PROMPT = """
You are an AI assistant powered by a retrieval-augmented generation (RAG) system. You are provided with CONTEXT—semantically relevant information retrieved from a private knowledge base—based on the USER_QUERY.
Your task is to generate a helpful, accurate, and context-aware response by combining your own knowledge with the retrieved CONTEXT. Always prioritize CONTEXT when it is available, but supplement it with your own understanding when appropriate. Ensure your response is clear, concise, and directly addresses the user's query.
"""
generator/response.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from openai import OpenAI
from typing import List
import os

from .prompts import SYS_PROMPT
from data_ingetion.data import AdvancedDatabase

# SECURITY FIX: the original code shipped a hard-coded API key as a fallback.
# That key is now public (committed to the repo) and must be rotated; the key
# must come exclusively from the environment.
api_key = os.getenv("TOGETHER_API")
if not api_key:
    raise RuntimeError("TOGETHER_API environment variable is not set")
client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")
EMBEDDING_MODEL = "togethercomputer/m2-bert-80M-2k-retrieval"
def generate_response(query: str, group_name: str, return_chunks: bool = True):
    """Embed the query, retrieve context from the group's collection, and
    generate a context-grounded answer.

    Returns (answer, context_chunks) when `return_chunks` is True (default),
    otherwise just the answer string.
    """
    database = AdvancedDatabase()
    context = database.get_context(get_embedding(query), group_name)
    answer = llm_response(context, query)
    return (answer, context) if return_chunks else answer
24
+
25
+
def llm_response(context: List[str], user_query: str, history=None, stream: bool = False):
    """Ask the chat model to answer `user_query` grounded in `context`.

    history: optional list of prior chat messages ({"role": ..., "content": ...}
        dicts) spliced between the system prompt and the current user turn.
    stream: reserved for a future streaming implementation; the response is
        currently always returned in full.
    """
    # BUG FIX: the original signature used a mutable default (history={}).
    # Besides the shared-state pitfall, unpacking a dict with *history would
    # splice its *keys* (strings) into the message list, not message objects.
    messages = [{"role": "system", "content": SYS_PROMPT}]
    if history:
        messages.extend(history)
    messages.append(
        {
            "role": "user",
            "content": f"Query : {user_query} \n\n Context: {context}",
        }
    )

    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=messages,
        stream=False,  # streaming not implemented yet (see `stream` param)
    )

    return response.choices[0].message.content
49
+
50
+
def get_embedding(query: str):
    """Return the embedding vector for a single query string."""
    result = client.embeddings.create(input=query, model=EMBEDDING_MODEL)
    return result.data[0].embedding
requirements.txt ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.9.0
3
+ asgiref==3.8.1
4
+ attrs==25.3.0
5
+ backoff==2.2.1
6
+ bcrypt==4.3.0
7
+ build==1.2.2.post1
8
+ cachetools==5.5.2
9
+ certifi==2025.4.26
10
+ charset-normalizer==3.4.2
11
+ chromadb==1.0.12
12
+ click==8.2.1
13
+ coloredlogs==15.0.1
14
+ distro==1.9.0
15
+ dnspython==2.7.0
16
+ durationpy==0.10
17
+ email_validator==2.2.0
18
+ exceptiongroup==1.3.0
19
+ fastapi==0.115.9
20
+ fastapi-cli==0.0.7
21
+ filelock==3.18.0
22
+ flatbuffers==25.2.10
23
+ fsspec==2025.5.1
24
+ google-auth==2.40.3
25
+ googleapis-common-protos==1.70.0
26
+ grpcio==1.72.1
27
+ h11==0.16.0
28
+ hf-xet==1.1.3
29
+ httpcore==1.0.9
30
+ httptools==0.6.4
31
+ httpx==0.28.1
32
+ huggingface-hub==0.32.4
33
+ humanfriendly==10.0
34
+ idna==3.10
35
+ importlib_metadata==8.7.0
36
+ importlib_resources==6.5.2
37
+ Jinja2==3.1.6
38
+ jiter==0.10.0
39
+ jsonpatch==1.33
40
+ jsonpointer==3.0.0
41
+ jsonschema==4.24.0
42
+ jsonschema-specifications==2025.4.1
43
+ kubernetes==32.0.1
44
+ langchain-core==0.3.64
45
+ langchain-text-splitters==0.3.8
46
+ langsmith==0.3.45
47
+ lxml==5.4.0
48
+ markdown-it-py==3.0.0
49
+ MarkupSafe==3.0.2
50
+ mdurl==0.1.2
51
+ mmh3==5.1.0
52
+ mpmath==1.3.0
53
+ numpy==2.2.6
54
+ oauthlib==3.2.2
55
+ onnxruntime==1.22.0
56
+ openai==1.84.0
57
+ opentelemetry-api==1.34.0
58
+ opentelemetry-exporter-otlp-proto-common==1.34.0
59
+ opentelemetry-exporter-otlp-proto-grpc==1.34.0
60
+ opentelemetry-instrumentation==0.55b0
61
+ opentelemetry-instrumentation-asgi==0.55b0
62
+ opentelemetry-instrumentation-fastapi==0.55b0
63
+ opentelemetry-proto==1.34.0
64
+ opentelemetry-sdk==1.34.0
65
+ opentelemetry-semantic-conventions==0.55b0
66
+ opentelemetry-util-http==0.55b0
67
+ orjson==3.10.18
68
+ overrides==7.7.0
69
+ packaging==24.2
70
+ posthog==4.2.0
71
+ protobuf==5.29.5
72
+ pyasn1==0.6.1
73
+ pyasn1_modules==0.4.2
74
+ pydantic==2.11.5
75
+ pydantic_core==2.33.2
76
+ Pygments==2.19.1
77
+ pypdf==5.6.0
78
+ PyPika==0.48.9
79
+ pyproject_hooks==1.2.0
80
+ python-dateutil==2.9.0.post0
81
+ python-docx==1.1.2
82
+ python-dotenv==1.1.0
83
+ python-multipart==0.0.20
84
+ PyYAML==6.0.2
85
+ referencing==0.36.2
86
+ requests==2.32.3
87
+ requests-oauthlib==2.0.0
88
+ requests-toolbelt==1.0.0
89
+ rich==14.0.0
90
+ rich-toolkit==0.14.7
91
+ rpds-py==0.25.1
92
+ rsa==4.9.1
93
+ shellingham==1.5.4
94
+ six==1.17.0
95
+ sniffio==1.3.1
96
+ starlette==0.45.3
97
+ sympy==1.14.0
98
+ tenacity==9.1.2
99
+ tokenizers==0.21.1
100
+ tomli==2.2.1
101
+ tqdm==4.67.1
102
+ typer==0.16.0
103
+ typing-inspection==0.4.1
104
+ typing_extensions==4.14.0
105
+ urllib3==2.4.0
106
+ uvicorn==0.34.3
107
+ uvloop==0.21.0
108
+ watchfiles==1.0.5
109
+ websocket-client==1.8.0
110
+ websockets==15.0.1
111
+ wrapt==1.17.2
112
+ zipp==3.22.0
113
+ zstandard==0.23.0
test/sample1.pdf ADDED
Binary file (82.1 kB). View file
 
test/unit_test.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from uuid import uuid4
3
+
4
+ base_url = "http://localhost:7860"
5
+ ingest_endpoint = "/ingest_files"
6
+ generator_endpoint = "/fetch_response"
7
+ get_metadata_endpoint = "/get_metadata"
8
+
9
+ collection_name = str(uuid4())
10
+ testcase_passed = 0
11
+ total_tests = 3
12
+
13
+
def test_ingest_files():
    """POST a sample PDF to /ingest_files and expect HTTP 200."""
    global testcase_passed
    form_fields = {"metadata": "legal,nda,confidential", "group_name": collection_name}
    with open("test/sample1.pdf", "rb") as pdf:
        upload = [("files", ("sample1.pdf", pdf, "application/pdf"))]
        response = requests.post(
            base_url + ingest_endpoint, files=upload, data=form_fields
        )
    print("Testing:", ingest_endpoint)
    print("Status:", response.status_code)
    print("Response:", response.content)
    assert response.status_code == 200, "Ingest endpoint failed"
    testcase_passed += 1
25
+
26
+
def test_llm_generation():
    """POST a query to /fetch_response and expect HTTP 200."""
    global testcase_passed
    query_params = {
        "query": "How was past year performance",
        "group_name": collection_name,
    }
    response = requests.post(base_url + generator_endpoint, params=query_params)
    print("Testing:", generator_endpoint)
    print("Status:", response.status_code)
    print("Response:", response.content)
    assert response.status_code == 200, "LLM generation endpoint failed"
    testcase_passed += 1
36
+
37
+
def test_get_metadata():
    """GET /get_metadata and expect HTTP 200."""
    global testcase_passed
    response = requests.get(base_url + get_metadata_endpoint)
    print("Testing:", get_metadata_endpoint)
    print("Status:", response.status_code)
    print("Response:", response.content)
    assert response.status_code == 200, "Metadata endpoint failed"
    testcase_passed += 1
46
+
47
+
if __name__ == "__main__":
    # Run the three smoke tests against a locally running server.
    failed = False
    try:
        test_ingest_files()
        test_llm_generation()
        test_get_metadata()
    except AssertionError as e:
        print("Test failed:", e)
        failed = True

    print(f"{testcase_passed}/{total_tests} tests passed")

    # BUG FIX: the original script always exited with status 0, so CI could
    # never detect a failing run; propagate failure via the exit code.
    if failed:
        raise SystemExit(1)