Chaitaniya committed on
Commit
4810f6f
·
1 Parent(s): 897015c
Files changed (9) hide show
  1. .gitignore +25 -0
  2. Dockerfile +21 -0
  3. app/main.py +28 -0
  4. app/pdf_handler.py +14 -0
  5. app/rag_pipeline.py +85 -0
  6. app/test.py +21 -0
  7. app/vector_store.py +17 -0
  8. requirements.txt +10 -0
  9. ui/ui_app.py +35 -0
.gitignore ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache and environment
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .env
7
+ .venv/
8
+
9
+ # Data folders (PDFs, ChromaDB, etc.)
10
+ data/*
11
+ !data/__init__.py
12
+
13
+ # Large file types
14
+ *.pdf
15
+ *.pt
16
+ *.bin
17
+
18
+ # Streamlit UI cache
19
+ .ui/
20
+ .streamlit/
21
+
22
+ # OS/IDE junk
23
+ .DS_Store
24
+ *.swp
25
+ .vscode/
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Container image for the RAG FastAPI service.
# (Removed the commented-out earlier draft of this Dockerfile — dead config.)
FROM python:3.13.0

WORKDIR /app
COPY . .

# git is needed by pip packages installed from VCS; install and clean the
# apt cache in the same layer so the image stays small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app/main.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from pydantic import BaseModel
from fastapi import FastAPI, UploadFile, File
from app.pdf_handler import process_pdf
from app.rag_pipeline import generate_answer
from app.test import query_test


app = FastAPI()


class ChatRequest(BaseModel):
    """Request body for the /chat/ endpoint."""
    query: str       # the user's question
    top_k: int = 3   # number of document chunks to retrieve


@app.post("/chat/")
async def chat(request: ChatRequest):
    """Answer a question with the RAG pipeline over the indexed PDFs."""
    return generate_answer(request.query, request.top_k)


@app.post("/upload/")
async def upload_pdf(file: UploadFile = File(...)):
    """Accept a PDF upload and index it into the vector store."""
    return process_pdf(file)


@app.post("/test/")
async def test_chat(query: str, top_k: int = 3):
    """Smoke-test endpoint that calls the LLM directly (no retrieval).

    Bug fix: renamed from `chat` — the original duplicated the /chat/
    handler's name, shadowing it in the module namespace.
    NOTE(review): top_k is accepted for parity with /chat/ but query_test
    currently ignores it.
    """
    return query_test(query)
app/pdf_handler.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from fastapi import UploadFile
from app.vector_store import store_pdf

# Directory where uploaded PDFs are saved before indexing.
UPLOAD_DIR = "data/uploaded_pdfs"


def process_pdf(file: UploadFile):
    """Save an uploaded PDF under UPLOAD_DIR and index it in the vector store.

    Returns a status dict echoing the stored filename.
    """
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    # Security fix: file.filename is client-controlled; basename() strips
    # any directory components (e.g. "../../evil.pdf") so the write cannot
    # escape UPLOAD_DIR (path traversal).
    safe_name = os.path.basename(file.filename)
    filepath = os.path.join(UPLOAD_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(file.file.read())
    store_pdf(filepath)
    return {"status": "uploaded", "filename": safe_name}
app/rag_pipeline.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from dotenv import load_dotenv
import requests
# Duplicate imports of Chroma and HuggingFaceEmbeddings removed (each was
# imported twice in the original).
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings

# Directory where the Chroma vector store persists its index.
CHROMA_DIR = "data/chroma_db"

load_dotenv()
HUGGINGFACE_API_KEY = os.getenv("HF_API_KEY")  # set this in .env or directly

# System prompt establishing the answer-from-context-only contract.
qna_system_message = """
You are an assistant whose work is to review the report and provide the appropriate answers from the context.
User input will have the context required by you to answer user questions.
This context will begin with the token: ###Context.
The context contains references to specific portions of a document relevant to the user query.

User questions will begin with the token: ###Question.

Please answer only using the context provided in the input. Do not mention anything about the context in your final answer.

If the answer is not found in the context, respond "I don't know".
"""

# User-message template; {context} and {question} are filled via str.replace
# in generate_answer (not str.format).
qna_user_message_template = """
###Context
Here are some documents that are relevant to the question mentioned below.
{context}

###Question
{question}
"""
37
+
38
+
def call_huggingface_mistral(prompt: str):
    """Send `prompt` to Mistral-7B-Instruct via the Hugging Face router.

    Returns the model's reply text, or an "[Error ...]" string on a
    non-200 response or a network failure.
    """
    api_url = "https://router.huggingface.co/featherless-ai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}
    payload = {
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "model": 'mistralai/Mistral-7B-Instruct-v0.2'
    }

    # timeout= prevents a hung upstream request from blocking the API
    # worker indefinitely; network errors map onto the existing
    # error-string convention instead of propagating.
    try:
        response = requests.post(api_url, headers=headers, json=payload,
                                 timeout=60)
    except requests.RequestException as exc:
        return f"[Error] request failed: {exc}"

    if response.status_code != 200:
        return f"[Error {response.status_code}] {response.text}"

    # Bug fix: return the reply *text*. The original returned the whole
    # message dict on success but a str on error, so callers (and the
    # Streamlit UI) received inconsistent types.
    return response.json()["choices"][0]["message"]["content"]
58
+
59
+
def generate_answer(query, top_k=3):
    """Retrieve the top_k most relevant chunks for `query` and ask the LLM.

    Returns {"answer": <model reply>}.
    """
    # Uses the module-level CHROMA_DIR; the original re-declared it as a
    # local, shadowing the constant.
    embeddings = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

    db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
    # Bug fix: honour top_k. The original hard-coded k=4 here and passed a
    # stray k=top_k kwarg to get_relevant_documents, so the parameter had
    # no effect on how many chunks were retrieved.
    retriever = db.as_retriever(
        search_type='similarity',
        search_kwargs={'k': top_k}
    )
    relevant_document_chunks = retriever.get_relevant_documents(query)
    context_list = [d.page_content for d in relevant_document_chunks]
    print(f'context_list: {context_list}')

    # Combine document chunks into a single context
    context = ". ".join(context_list)
    # str.replace (not .format) because the chunk text may itself contain
    # braces that would break format().
    user_message = qna_user_message_template.replace('{context}', context)
    user_message = user_message.replace('{question}', query)

    prompt = qna_system_message + '\n' + user_message
    print(f'Prompt: {prompt}')

    answer = call_huggingface_mistral(prompt)
    return {"answer": answer}
app/test.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import requests

API_URL = "https://router.huggingface.co/featherless-ai/v1/chat/completions"
headers = {
    # getenv avoids a KeyError at import time when HF_API_KEY is unset
    # (the original used os.environ['HF_API_KEY'], which crashes the whole
    # app on import).
    "Authorization": f"Bearer {os.getenv('HF_API_KEY', '')}",
}


def query_test(payload):
    """Send `payload` (the user's question string) straight to the LLM.

    Connectivity smoke test used by the /test/ endpoint; returns the
    response's message object.
    """
    # Bug fix: the original immediately shadowed `payload` with a
    # hard-coded "capital of France" request body, so the caller's query
    # was silently ignored.
    body = {
        "messages": [
            {
                "role": "user",
                "content": payload
            }
        ],
        "model": "mistralai/Mistral-7B-Instruct-v0.2"
    }
    # timeout= keeps a hung upstream call from blocking the worker.
    response = requests.post(API_URL, headers=headers, json=body, timeout=60)
    return response.json()["choices"][0]["message"]
app/vector_store.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain.vectorstores import Chroma
3
+ from langchain.embeddings import SentenceTransformerEmbeddings
4
+ from langchain.document_loaders import PyMuPDFLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+
7
+ CHROMA_DIR = "data/chroma_db"
8
+
9
+
def store_pdf(pdf_path):
    """Load a PDF, split it into overlapping chunks, and index them in Chroma.

    The index is written to the module-level CHROMA_DIR.
    """
    documents = PyMuPDFLoader(pdf_path).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=100)
    pieces = text_splitter.split_documents(documents)
    embedding_fn = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
    Chroma.from_documents(pieces, embedding_fn, persist_directory=CHROMA_DIR)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ langchain
4
+ chromadb
5
+ transformers
6
+ sentence-transformers
7
+ PyMuPDF
8
+ langchain-community
9
+ python-multipart
10
+ streamlit
11
+ python-dotenv
12
+ requests
ui/ui_app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import requests

API_CHAT_URL = "http://localhost:8000/chat/"
API_UPLOAD_URL = "http://localhost:8000/upload/"
REQUEST_TIMEOUT = 120  # seconds; LLM-backed calls can be slow

st.title("📚 HelpDevelopers RAG Chatbot")

# PDF Upload Section
st.header("📤 Upload a PDF")
uploaded_file = st.file_uploader("Choose a PDF to upload", type="pdf")

if uploaded_file is not None:
    if st.button("Upload"):
        with st.spinner("Uploading..."):
            files = {"file": (uploaded_file.name,
                              uploaded_file, "application/pdf")}
            # Bug fix: a timeout and exception handling keep a down/hung
            # backend from crashing the Streamlit script with an unhandled
            # ConnectionError.
            try:
                res = requests.post(API_UPLOAD_URL, files=files,
                                    timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                st.error(f"Upload failed: {exc}")
            else:
                if res.status_code == 200:
                    st.success(f"{uploaded_file.name} uploaded successfully!")
                else:
                    st.error("Upload failed.")

# Chat Section
st.header("💬 Ask a Question")
query = st.text_input("Your question:")
if st.button("Submit"):
    if query:
        with st.spinner("Thinking..."):
            try:
                res = requests.post(API_CHAT_URL,
                                    json={"query": query, "top_k": 3},
                                    timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                st.error(f"Error: {exc}")
            else:
                if res.status_code == 200:
                    st.success(res.json().get("answer", "No response."))
                else:
                    st.error(f"Error: {res.text}")
+ st.error(f"Error: {res.text}")