PranavReddy18 commited on
Commit
ca767c0
·
verified ·
1 Parent(s): faa5248

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/ix[[:space:]]biology[[:space:]]em.pdf filter=lfs diff=lfs merge=lfs -text
37
+ data/x[[:space:]]biology[[:space:]]em.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import requests

st.set_page_config(page_title="BioRAG Assistant 🧬", page_icon="🧪", layout="wide")

st.markdown("<h1 style='text-align: center;'>🧠 BioRAG: Biology Learning Assistant</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center; color: gray;'>Ask your biology questions and get accurate, syllabus-based answers.</p>", unsafe_allow_html=True)

# Single-question form; clear_on_submit resets the text box after each ask.
with st.form("chat_form", clear_on_submit=True):
    question = st.text_input("Ask your question:", placeholder="e.g. What is the function of mitochondria?")
    submitted = st.form_submit_button("Ask")

# "backend" is the docker-compose service name of the FastAPI container.
API_URL = "http://backend:2000/predict"

if submitted and question:
    with st.spinner("Thinking..."):
        try:
            # Fix: add a timeout so a hung backend cannot freeze the UI forever,
            # and raise on HTTP error status instead of JSON-parsing an error body.
            response = requests.post(API_URL, json={"question": question}, timeout=60)
            response.raise_for_status()
            answer = response.json().get("answer", "Sorry, no answer found.")
        except Exception as e:
            # Best-effort error surface: show the failure inline in the chat bubble.
            answer = f"⚠️ Error: {e}"

    st.markdown(f"**🧑 You:** {question}")
    st.markdown(
        f"<div style='background-color: #f1f1f1; padding: 15px; border-radius: 10px;'><strong>🧬 BioRAG:</strong><br>{answer}</div>",
        unsafe_allow_html=True
    )
+
backend.Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Dockerfile.backend — FastAPI RAG service image.
FROM python:3.10-slim

WORKDIR /app

# Install dependencies first so code edits do not invalidate the pip layer cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code plus the PDF corpus the RAG pipeline indexes at startup.
COPY main.py .
COPY data /app/data

EXPOSE 2000

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "2000"]
data/ix biology em.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72fb3a53a523375e6fca8a90707dadecd04a482f744a3965f435b5820e1dc5c6
3
+ size 16715077
data/x biology em.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4590d0f7c59f1965d42932feb4eeed6a1f7517fc470ed79c488641d6f82509f9
3
+ size 10402338
docker-compose.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Two-service stack: FastAPI RAG backend + Streamlit frontend.
version: '3.8'  # NOTE: the top-level version key is ignored (obsolete) in Compose v2+

services:
  # Backend is reachable from the frontend container as http://backend:2000
  backend:
    build:
      context: .
      dockerfile: backend.Dockerfile
    ports:
      - "2000:2000"
    environment:
      # Forwarded from the host shell / .env file at compose time.
      - GROQ_API_KEY=${GROQ_API_KEY}
    restart: always

  # Streamlit UI, published on host port 8501.
  frontend:
    build:
      context: .
      dockerfile: frontend.Dockerfile
    ports:
      - "8501:8501"
    depends_on:
      - backend
    restart: always
frontend.Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
# Dockerfile.frontend — Streamlit UI image.
FROM python:3.10-slim

WORKDIR /app

# Install dependencies first to keep the pip layer cacheable.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .
# Fix: dropped `COPY data /app/data` — app.py never reads the PDF corpus
# (only the backend indexes it), so copying it only bloated this image.

EXPOSE 8501

CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
main.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- Application setup: imports, PDF ingestion, vector index, LLM chain parts ---
# Fix: all imports consolidated at the top of the module (PEP 8); the original
# interleaved third-party imports with executable statements.
import os
from typing import List

import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document, BaseRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from sentence_transformers import CrossEncoder

# Load GROQ_API_KEY (and anything else) from a local .env file, if present.
load_dotenv()

app = FastAPI(title="A RAG-Driven Learning Assistant for Biology")

# Ingest every PDF under data/ and split it into overlapping chunks for retrieval.
loader = DirectoryLoader('data/', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(documents)

# Embed the chunks and build an in-memory FAISS similarity index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)

# Fail fast at startup when the LLM credential is missing.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY is not set in the environment variables")

llm = ChatGroq(api_key=GROQ_API_KEY, model='llama-3.3-70b-versatile')

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful and knowledgeable biology tutor. Answer clearly and accurately. If the query is out of syllabus, just respond with 'Out of syllabus'."),
    ("human", "Context:\n{context}\n\nQuestion: {question}")
])

# Keep only the last 3 conversational exchanges in memory.
memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    return_messages=True,
    k=3
)

# Cross-encoder used to re-score vector-store candidates per query.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
def rerank_documents(query: str, retrieved_docs: List[Document]) -> List[Document]:
    """Return *retrieved_docs* re-ordered by cross-encoder relevance to *query*, best first.

    Scores each (query, page_content) pair with the module-level ``reranker``
    CrossEncoder and sorts documents by score descending.
    """
    # Fix: guard the empty case — there is nothing to score or sort, and a
    # zero-length batch is not worth sending to the cross-encoder.
    if not retrieved_docs:
        return []
    pairs = [(query, doc.page_content) for doc in retrieved_docs]
    scores = reranker.predict(pairs)
    # key=... compares scores only, so unorderable Document objects are never compared.
    ranked = sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked]
57
+
class RerankRetriever(BaseRetriever, BaseModel):
    """Retriever that re-scores a wider candidate pool with the cross-encoder.

    Pulls candidates from ``base_retriever``, reranks them via
    ``rerank_documents``, and keeps only the ``top_k`` best.
    """

    # Underlying vector-store retriever supplying the candidate pool.
    base_retriever: BaseRetriever
    # Number of reranked documents to return.
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        candidates = self.base_retriever.invoke(query)
        best_first = rerank_documents(query, candidates)
        return best_first[:self.top_k]
66
+
# Pull a generous candidate pool (k=10) from FAISS, then rerank down to the top 5.
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
custom_retriever = RerankRetriever(base_retriever=base_retriever, top_k=5)

# Conversational RAG chain: Groq LLM + windowed memory + custom prompt + reranked retrieval.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=custom_retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt},
)
76
+
class QuestionInput(BaseModel):
    # Request body schema for /predict: a single free-text biology question.
    question: str


@app.post("/predict")
def predict(input: QuestionInput):
    """Answer a biology question via the conversational RAG chain.

    Returns ``{"answer": <str>}``; the chain also updates its windowed memory
    as a side effect, so consecutive calls share conversational context.
    """
    # Fix: call the chain with .invoke() — invoking a chain directly
    # (__call__) is deprecated in modern LangChain and emits a warning.
    result = qa_chain.invoke({"question": input.question})
    return {"answer": result["answer"]}


if __name__ == "__main__":
    # Port 2000 matches backend.Dockerfile's EXPOSE and docker-compose's mapping.
    uvicorn.run(app, host='0.0.0.0', port=2000)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-groq
3
+ langchain-huggingface
4
+ langchain-community
5
+ python-dotenv
6
+ pypdf
7
+ faiss-cpu
8
+ fastapi
9
+ uvicorn
10
+ pydantic
11
+ streamlit
12
+ pytest
13
+ huggingface_hub[hf_xet]