amn-sdqi committed on
Commit
676f2da
·
1 Parent(s): 1a89d3b

RAG Integrated

Browse files
Files changed (3) hide show
  1. app.py +64 -45
  2. app_bkp.py +51 -20
  3. requirements.txt +9 -3
app.py CHANGED
"""Streamlit chatbot over an uploaded PDF (pre-RAG version).

PyMuPDF extracts the document text; a FLAN-T5 text2text model generates an
answer from a prompt built out of the first 1500 characters of the document
plus the user's question. Conversation history lives in Streamlit session
state.
"""

import streamlit as st
import torch
from transformers import pipeline
import fitz

# Pick the GPU only when one exists — the previous hard-coded device=0
# raised on CPU-only hosts.
_DEVICE = 0 if torch.cuda.is_available() else -1

# NOTE: a "question-answering" pipeline (deepset/roberta-base-squad2) was
# also loaded here and invoked on every question, but its result was never
# displayed or stored — that dead model load and inference are removed.
text_gen = pipeline("text2text-generation", model="google/flan-t5-base", device=_DEVICE)


# extract text from uploaded document
def extract_PDF(file):
    """Return the concatenated text of every page of the uploaded PDF.

    `file` is a Streamlit UploadedFile; its raw bytes are handed to PyMuPDF.
    """
    text = ""
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        for page in doc:
            text += page.get_text()
    return text


# ------------------------------------------------------------------------------
# -----------------------------------Streamlit UI--------------------------------

st.title("Chatbot with Huggingface")

st.subheader("Upload file")
pdf_file = st.file_uploader("Upload", type="pdf")

# Initialize Session state for convo history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

if "context" not in st.session_state:
    st.session_state.context = None

# extract text and store in the session (only once per session)
if pdf_file is not None and st.session_state.context is None:
    st.session_state.context = extract_PDF(pdf_file)

# Chat section
if st.session_state.context:
    st.subheader("Chat with the PDF")

    question = st.text_input("You", key="user_input")

    if question:
        # Only the first 1500 characters fit the model's input budget.
        context_chunk = st.session_state.context[:1500]
        prompt = f"Context: {context_chunk}\nQuestion: {question}\nAnswer:"

        generated = text_gen(prompt, max_length=100)[0]['generated_text']

        # save convo
        st.session_state.chat_history.append(
            {"user": question, "bot": generated}
        )

    # Display chat
    for chat in st.session_state.chat_history:
        st.markdown(f"**You:** {chat['user']}")
        st.markdown(f"**Bot:** {chat['bot']}")
else:
    st.info("Please upload PDF to begin")
 
"""Streamlit RAG chatbot.

Indexes an uploaded PDF into Chroma with MiniLM sentence embeddings, then
answers questions about it with Gemini through a LangChain RetrievalQA
chain. Chat history and the built chain live in Streamlit session state.
"""

import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
import tempfile
import os
from dotenv import load_dotenv
from pydantic import SecretStr

load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Loaded once per process and reused for every upload.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# ---------------------------- SETUP ----------------------------

st.title("📄 LangChain RAG Chatbot")

# Fail fast with a clear message instead of an opaque auth error raised
# deep inside the Gemini client on the first query.
if not GOOGLE_API_KEY:
    st.error("GOOGLE_API_KEY is not set — add it to your environment or .env file.")
    st.stop()

# Session state
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

if "qa_chain" not in st.session_state:
    st.session_state.qa_chain = None

# ---------------------------- FILE UPLOAD ----------------------------

st.subheader("Upload your PDF")
pdf_file = st.file_uploader("Upload", type="pdf")

if pdf_file is not None and st.session_state.qa_chain is None:
    with st.spinner("🔍 Processing document..."):
        # Save file temporarily so PyPDFLoader can read it from a path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(pdf_file.read())
            tmp_path = tmp_file.name

        try:
            # Load pages, then chunk once below. The original called
            # load_and_split() *and* a splitter, chunking the text twice;
            # load() + a single splitter pass is the intended flow.
            loader = PyPDFLoader(tmp_path)
            documents = loader.load()
        finally:
            # delete=False means nothing cleans this up automatically —
            # unlink it so uploads don't accumulate in the temp directory.
            os.unlink(tmp_path)

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(documents)

        # Vector store
        vectordb = Chroma.from_documents(
            chunks, embeddings, persist_directory="./chroma_db"
        )
        retriever = vectordb.as_retriever()

        # QA Chain (GOOGLE_API_KEY is guaranteed non-empty by the guard above)
        llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", api_key=SecretStr(GOOGLE_API_KEY))
        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

        # Store in session
        st.session_state.qa_chain = qa_chain
        st.success("✅ Document processed and indexed!")

# ---------------------------- CHAT ----------------------------

if st.session_state.qa_chain:
    st.subheader("💬 Ask a question")

    question = st.text_input("You:", key="user_input")

    if question:
        with st.spinner("🤖 Generating answer..."):
            answer = st.session_state.qa_chain.run(question)
            st.session_state.chat_history.append({"user": question, "bot": answer})

    # Display chat history
    for chat in st.session_state.chat_history:
        st.markdown(f"🧑 **You:** {chat['user']}")
        st.markdown(f"🤖 **Bot:** {chat['bot']}")

    # Reset button
    if st.button("🔄 Reset Chat"):
        st.session_state.chat_history = []
        st.session_state.qa_chain = None
        st.rerun()
else:
    st.info("📂 Please upload a PDF to begin.")
app_bkp.py CHANGED
"""Minimal Streamlit app: upload a PDF and run extractive QA over its text."""

import streamlit as st
from transformers import pipeline
import fitz  # PyMuPDF

question_answerer = pipeline(
    "question-answering", model="distilbert-base-cased-distilled-squad"
)


def extract_text_from_pdf(uploaded_file):
    """Concatenate and return the text of every page in the uploaded PDF."""
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)


st.title("PDF Chatbot with Transformers")

st.subheader("Upload a PDF Document")
pdf_file = st.file_uploader("Upload your PDF file", type="pdf")

# Guard clause: nothing to do until a document is provided.
if pdf_file is None:
    st.info("Please upload a PDF to begin.")
else:
    context = extract_text_from_pdf(pdf_file)

    st.subheader("Ask a Question")
    question = st.text_input("Your question:", "What is this document about?")

    if question:
        result = question_answerer(question=question, context=context)
        st.write("**Answer:**", result["answer"])
 
"""Backup of the pre-RAG chatbot: PyMuPDF extraction + FLAN-T5 generation.

Extracts the uploaded PDF's text, prompts google/flan-t5-base with the
first 1500 characters plus the user's question, and keeps the conversation
in Streamlit session state.
"""

import streamlit as st
import torch
from transformers import pipeline
import fitz

# Select the GPU only when present — the hard-coded device=0 raised on
# CPU-only machines.
_DEVICE = 0 if torch.cuda.is_available() else -1

# NOTE: the extractive "question-answering" pipeline
# (deepset/roberta-base-squad2) that was loaded here was called on every
# question but its result was never shown or saved; the dead model load
# and per-question inference are removed.
text_gen = pipeline("text2text-generation", model="google/flan-t5-base", device=_DEVICE)


# extract text from uploaded document
def extract_PDF(file):
    """Return the concatenated text of all pages of the uploaded PDF."""
    text = ""
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        for page in doc:
            text += page.get_text()  # type: ignore
    return text


# ------------------------------------------------------------------------------
# -----------------------------------Streamlit UI--------------------------------

st.title("Chatbot with Huggingface")

st.subheader("Upload file")
pdf_file = st.file_uploader("Upload", type="pdf")

# Initialize Session state for convo history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

if "context" not in st.session_state:
    st.session_state.context = None

# extract text and store in the session (only once per session)
if pdf_file is not None and st.session_state.context is None:
    st.session_state.context = extract_PDF(pdf_file)

# Chat section
if st.session_state.context:
    st.subheader("Chat with the PDF")

    question = st.text_input("You", key="user_input")

    if question:
        # Only the first 1500 characters fit the model's input budget.
        context_chunk = st.session_state.context[:1500]
        prompt = f"Context: {context_chunk}\nQuestion: {question}\nAnswer:"

        generated = text_gen(prompt, max_length=100)[0]['generated_text']  # type: ignore

        # save convo
        st.session_state.chat_history.append(
            {"user": question, "bot": generated}
        )

    # Display chat
    for chat in st.session_state.chat_history:
        st.markdown(f"**You:** {chat['user']}")
        st.markdown(f"**Bot:** {chat['bot']}")
else:
    st.info("Please upload PDF to begin")
requirements.txt CHANGED
@@ -1,6 +1,12 @@
1
  streamlit
2
- torch
3
- accelerate
 
 
4
  transformers
5
  tf-keras
6
- PyMuPDF
 
 
 
 
 
streamlit
openai
langchain-google-genai
langchain-core
langchain-text-splitters
transformers
tf-keras
langchain
langchain-community
chromadb
tiktoken
pypdf
sentence-transformers
python-dotenv