samagra44 committed on
Commit 6e357ca · 1 Parent(s): 6067c02

initial commit

.gitignore ADDED
Binary file (52 Bytes).
 
Dockerfile CHANGED
@@ -1,21 +1,14 @@
- FROM python:3.9-slim

  WORKDIR /app

- RUN apt-get update && apt-get install -y \
-     build-essential \
-     curl \
-     software-properties-common \
-     git \
-     && rm -rf /var/lib/apt/lists/*

- COPY requirements.txt ./
- COPY src/ ./src/

- RUN pip3 install -r requirements.txt

- EXPOSE 8501

- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+ FROM python:3.10
+
  WORKDIR /app
+
+ COPY requirements.txt .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ RUN chmod +x start.sh
+
+ CMD ["./start.sh"]
 
README.md CHANGED
@@ -1,4 +1,3 @@
- ---
  title: My Doc Rag
  emoji: 🚀
  colorFrom: red
@@ -6,14 +5,6 @@ colorTo: red
  sdk: docker
  app_port: 8501
  tags:
- - streamlit
  pinned: false
- short_description: Streamlit template space
- ---
-
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).

  title: My Doc Rag
  emoji: 🚀
  colorFrom: red
  colorTo: red
  sdk: docker
  app_port: 8501
  tags:
+ - streamlit
  pinned: false
+ short_description: Streamlit template space
 
app.py ADDED
@@ -0,0 +1,4 @@
+ from frontend.chat_gui import main
+
+ if __name__ == "__main__":
+     main()
config/__init__.py ADDED
File without changes
config/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (128 Bytes).
 
config/__pycache__/nodes.cpython-310.pyc ADDED
Binary file (489 Bytes).
 
config/nodes.py ADDED
@@ -0,0 +1,20 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+ LLM_MODEL_NAME = os.getenv("LLM_MODEL_NAME")
+
+ BASE_URL = os.getenv("BASE_URL")
+
+ LLM_TEMPERATURE_SET = os.getenv("LLM_TEMPERATURE_SET")
+
+ EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME")
+
+ EMBEDDING_MODEL_TASK = os.getenv("EMBEDDING_MODEL_TASK")
+
+ PERSIST_DIRECTORY = os.getenv("PERSIST_DIRECTORY")
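For reference, a minimal sketch of the `.env` file this module expects might look like the following. The variable names come from the `os.getenv` calls above; every value shown is a placeholder assumption, not something defined in this commit.

```
OPENROUTER_API_KEY=sk-or-...                                  # placeholder
HUGGINGFACEHUB_API_TOKEN=hf_...                               # placeholder
LLM_MODEL_NAME=openai/gpt-4o-mini                             # any OpenRouter model id
BASE_URL=https://openrouter.ai/api/v1                         # OpenRouter's OpenAI-compatible endpoint
LLM_TEMPERATURE_SET=0.7
EMBEDDING_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2   # example embedding model
EMBEDDING_MODEL_TASK=feature-extraction
PERSIST_DIRECTORY=./chroma_db
```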
database/__init__.py ADDED
File without changes
database/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (130 Bytes).
 
database/__pycache__/create_database.cpython-310.pyc ADDED
Binary file (641 Bytes).
 
database/__pycache__/load_database.cpython-310.pyc ADDED
Binary file (689 Bytes).
 
database/create_database.py ADDED
@@ -0,0 +1,13 @@
+ from langchain_chroma import Chroma
+ from utils.load_llm import TEXT_SPLITTER, EMBEDDING_MODEL
+ from config.nodes import PERSIST_DIRECTORY
+
+ def create_or_add_to_collection(collection_name: str, docs):
+     texts = TEXT_SPLITTER.split_documents(docs)
+     database_chroma = Chroma(
+         collection_name=collection_name,
+         persist_directory=PERSIST_DIRECTORY,
+         embedding_function=EMBEDDING_MODEL
+     )
+     database_chroma.add_documents(texts)
+     return database_chroma
database/load_database.py ADDED
@@ -0,0 +1,15 @@
+ from langchain_chroma import Chroma
+ from utils.load_llm import TEXT_SPLITTER, EMBEDDING_MODEL
+ from config.nodes import PERSIST_DIRECTORY
+
+
+ def load_retriever(collection_name: str, score_threshold=0.6):
+     load_database_chroma = Chroma(
+         collection_name=collection_name,
+         persist_directory=PERSIST_DIRECTORY,
+         embedding_function=EMBEDDING_MODEL
+     )
+     return load_database_chroma.as_retriever(
+         search_type="similarity_score_threshold",
+         search_kwargs={"score_threshold": score_threshold}
+     )
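Taken together with create_database.py, these two modules are meant to be driven roughly as in the sketch below. This is a hypothetical usage example, not a file in this commit; it assumes the loaders from utils/load_documents.py and a URL/collection name chosen for illustration.

```python
# Hypothetical usage sketch (not part of this commit).
from utils.load_documents import load_document_from_url
from database.create_database import create_or_add_to_collection
from database.load_database import load_retriever

# 1. Load and index a web page into a named Chroma collection.
docs = load_document_from_url("https://example.com/some-article")
create_or_add_to_collection("my_collection", docs)

# 2. Reopen the same collection as a score-thresholded retriever and query it.
retriever = load_retriever("my_collection", score_threshold=0.6)
relevant_docs = retriever.invoke("What is the article about?")
print(len(relevant_docs), "documents retrieved")
```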
frontend/__pycache__/chat_gui.cpython-310.pyc ADDED
Binary file (1.83 kB).
 
frontend/chat_gui.py ADDED
@@ -0,0 +1,58 @@
+ import requests
+ import streamlit as st
+
+ API_URL = "http://localhost:8000"
+
+ def main():
+     st.set_page_config(page_title="📚 Chat Assistant", layout="wide")
+     st.title("📚 Chat Assistant")
+
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+
+     collection_name = st.text_input("Name Your Collection", value="my_collection")
+
+     with st.expander("📤 Upload Documents or URL"):
+         upload_type = st.radio("Choose upload type", ["File", "Web URL"], horizontal=True)
+
+         if upload_type == "File":
+             uploaded_file = st.file_uploader("Upload .pdf or .txt", type=["pdf", "txt"])
+             if uploaded_file and st.button("Upload File"):
+                 res = requests.post(
+                     f"{API_URL}/upload/file",
+                     files={"file": uploaded_file},
+                     data={"collection": collection_name}
+                 )
+                 st.success(res.json().get("message", "Uploaded"))
+
+         if upload_type == "Web URL":
+             url = st.text_input("Enter Web URL")
+             if st.button("Upload URL"):
+                 res = requests.post(f"{API_URL}/upload/url", json={
+                     "doc_type": "url",
+                     "content": url,
+                     "file_name": collection_name,
+                 })
+                 st.success(res.json().get("message", "URL Indexed"))
+
+
+     for msg in st.session_state.messages:
+         with st.chat_message(msg["role"]):
+             st.markdown(msg["content"])
+
+
+     if query := st.chat_input("Ask me anything..."):
+         st.session_state.messages.append({"role": "user", "content": query})
+         with st.chat_message("user"):
+             st.markdown(query)
+
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 res = requests.post(f"{API_URL}/chat/", json={
+                     "query": query,
+                     "collection": collection_name
+                 })
+
+             answer = res.json()["answer"]
+             st.markdown(answer)
+             st.session_state.messages.append({"role": "assistant", "content": answer})
helper/__init__.py ADDED
File without changes
helper/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (128 Bytes).
 
helper/__pycache__/model_load.cpython-310.pyc ADDED
Binary file (912 Bytes).
 
helper/model_load.py ADDED
@@ -0,0 +1,15 @@
+ from pydantic import BaseModel
+ from typing import Optional, List
+
+ class UploadDocRequest(BaseModel):
+     doc_type: str
+     content: Optional[str] = None
+     file_name: Optional[str] = None
+
+ class ChatRequest(BaseModel):
+     query: str
+     collection: str
+
+ class ChatResponse(BaseModel):
+     answer: str
+     sources: Optional[List[str]] = []
requirements.txt CHANGED
@@ -1,3 +1,11 @@
- altair
- pandas
- streamlit

+ fastapi
+ streamlit
+ python-dotenv
+ chromadb
+ langchain
+ uvicorn
+ langchain-huggingface
+ langchain-openai
+ langchain-core
+ langchain-community
+ langchain-chroma
routes/__init__.py ADDED
File without changes
routes/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (128 Bytes).
 
routes/__pycache__/chat_route.cpython-310.pyc ADDED
Binary file (764 Bytes).
 
routes/__pycache__/entry_point.cpython-310.pyc ADDED
Binary file (370 Bytes).
 
routes/__pycache__/upload_file_route.cpython-310.pyc ADDED
Binary file (1.03 kB).
 
routes/chat_route.py ADDED
@@ -0,0 +1,12 @@
+ from fastapi import APIRouter, UploadFile, File, Form
+ from helper.model_load import ChatRequest, ChatResponse
+ from database.load_database import load_retriever
+ from utils.load_rag_chain import generate_response_from_context
+
+ chat_router = APIRouter(prefix="/chat", tags=['Chat'])
+
+ @chat_router.post("/", response_model=ChatResponse)
+ async def chat_rag(data: ChatRequest):
+     retriever = load_retriever(data.collection)
+     answer = generate_response_from_context(retriever=retriever, question=data.query)
+     return ChatResponse(answer=answer)
routes/entry_point.py ADDED
@@ -0,0 +1,19 @@
+ from .chat_route import chat_router
+ from .upload_file_route import upload_file_router
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+
+ app = FastAPI(title="RAG API")
+ app.include_router(router=chat_router)
+ app.include_router(router=upload_file_router)
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # if __name__ == "__main__":
+ #     import uvicorn
+ #     uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
routes/upload_file_route.py ADDED
@@ -0,0 +1,18 @@
+ from fastapi import APIRouter, UploadFile, File, Form
+ from helper.model_load import UploadDocRequest
+ from utils.load_documents import load_document_from_file, load_document_from_url
+ from database.create_database import create_or_add_to_collection
+
+ upload_file_router = APIRouter(prefix="/upload", tags=['Upload'])
+
+ @upload_file_router.post("/file")
+ async def upload(file: UploadFile = File(...), collection: str = Form(...)):
+     docs = load_document_from_file(file)
+     create_or_add_to_collection(collection, docs)
+     return {"message": f"Uploaded and Indexed in {collection}"}
+
+ @upload_file_router.post("/url")
+ async def upload_url(data: UploadDocRequest):
+     docs = load_document_from_url(data.content)
+     create_or_add_to_collection(data.file_name, docs)
+     return {"message": f"Indexed URL to {data.file_name}"}
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
start.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+
+ uvicorn routes.entry_point:app --host 0.0.0.0 --port 8000 &
+
+ streamlit run app.py --server.port 7860 --server.address 0.0.0.0
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (127 Bytes).
 
utils/__pycache__/load_documents.cpython-310.pyc ADDED
Binary file (954 Bytes).
 
utils/__pycache__/load_llm.cpython-310.pyc ADDED
Binary file (847 Bytes).
 
utils/__pycache__/load_rag_chain.cpython-310.pyc ADDED
Binary file (668 Bytes).
 
utils/load_documents.py ADDED
@@ -0,0 +1,42 @@
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
+ from langchain_core.documents import Document
+ import tempfile
+
+ def load_document_from_file(file) -> list[Document]:
+     suffix = "." + file.filename.split(".")[-1]
+     temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+     temp.write(file.file.read())
+     temp.close()
+
+     if suffix == ".pdf":
+         loader = PyPDFLoader(temp.name)
+     elif suffix == ".txt":
+         loader = TextLoader(temp.name)
+     else:
+         raise ValueError("Unsupported File Type")
+     return loader.load()
+
+ def load_document_from_url(url: str) -> list[Document]:
+     loader = WebBaseLoader(url)
+     return loader.load()
+
+ # def load_document_file_or_url(file_or_url) -> list[Document]:
+ #     # If input is a URL string
+ #     if isinstance(file_or_url, str) and re.match(r'^https?://', file_or_url):
+ #         loader = WebBaseLoader(file_or_url)
+ #         return loader.load()
+
+ #     # Otherwise treat it as a file-like object (e.g. from file upload)
+ #     suffix = "." + file_or_url.filename.split(".")[-1]
+ #     temp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+ #     temp.write(file_or_url.file.read())
+ #     temp.close()
+
+ #     if suffix == ".pdf":
+ #         loader = PyPDFLoader(temp.name)
+ #     elif suffix == ".txt":
+ #         loader = TextLoader(temp.name)
+ #     else:
+ #         raise ValueError("Unsupported File Type")
+
+ #     return loader.load()
utils/load_llm.py ADDED
@@ -0,0 +1,24 @@
+ from langchain_openai import ChatOpenAI
+ from langchain_huggingface import HuggingFaceEndpointEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from config.nodes import OPENROUTER_API_KEY, HUGGINGFACEHUB_API_TOKEN, BASE_URL, LLM_MODEL_NAME, LLM_TEMPERATURE_SET, EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_TASK
+ from dotenv import load_dotenv
+ import os
+
+ LLM_MODEL = ChatOpenAI(
+     temperature=LLM_TEMPERATURE_SET,
+     model=LLM_MODEL_NAME,
+     base_url=BASE_URL,
+     api_key=OPENROUTER_API_KEY
+ )
+
+ EMBEDDING_MODEL = HuggingFaceEndpointEmbeddings(
+     model=EMBEDDING_MODEL_NAME,
+     task=EMBEDDING_MODEL_TASK,
+     huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
+ )
+
+ TEXT_SPLITTER = RecursiveCharacterTextSplitter(
+     chunk_size=1000,
+     chunk_overlap=200
+ )
utils/load_rag_chain.py ADDED
@@ -0,0 +1,10 @@
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from .load_llm import LLM_MODEL
+
+ def generate_response_from_context(retriever, question: str):
+     prompt = ChatPromptTemplate.from_messages([
+         ("human", "Answer this using context:\n{context}\n\nQuestion:\n{question}")
+     ])
+     rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | LLM_MODEL
+     return rag_chain.invoke(question).content
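One caveat worth noting: in this chain the retriever's output, a list of Document objects, is passed straight into the {context} slot, so the prompt receives the list's repr rather than clean text. A common refinement is to join the retrieved page_content before formatting; the sketch below shows that variant as an assumption, not as a file in this commit.

```python
# Hypothetical variant of utils/load_rag_chain.py that flattens retrieved
# Documents into plain text before they reach the prompt (not part of this commit).
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from utils.load_llm import LLM_MODEL


def format_docs(docs):
    # Join the page_content of each retrieved Document with blank lines.
    return "\n\n".join(doc.page_content for doc in docs)


def generate_response_from_context(retriever, question: str):
    prompt = ChatPromptTemplate.from_messages([
        ("human", "Answer this using context:\n{context}\n\nQuestion:\n{question}")
    ])
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | LLM_MODEL
    )
    return rag_chain.invoke(question).content
```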