himanshukumar378 committed on
Commit
67d85a4
·
verified ·
1 Parent(s): 8096e94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -70
app.py CHANGED
@@ -1,82 +1,96 @@
1
- # app.py
2
- import streamlit as st
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
- from langchain_community.vectorstores import FAISS
7
  from langchain.chains import ConversationalRetrievalChain
8
  from langchain.memory import ConversationBufferMemory
9
- from langchain_community.llms import HuggingFacePipeline
10
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
11
- from streamlit_chat import message
12
 
13
- # ----------------- Initialize session state -----------------
14
- if "processComplete" not in st.session_state:
15
- st.session_state.processComplete = False
 
 
 
16
 
17
- if "conversation" not in st.session_state:
18
- st.session_state.conversation = None
19
 
20
- if "chat_history" not in st.session_state:
21
- st.session_state.chat_history = []
 
22
 
23
- # ----------------- Main function -----------------
24
- def main():
25
- st.set_page_config(page_title="PDF Chatbot", layout="wide")
26
- st.title("📑 Chat with Multiple PDFs")
27
 
28
- uploaded_files = st.file_uploader(
29
- "Upload your PDFs", type=["pdf"], accept_multiple_files=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  )
31
 
32
- if uploaded_files and st.button("Process PDFs"):
33
- with st.spinner("Processing PDFs..."):
34
- all_text = ""
35
- for file in uploaded_files:
36
- pdf_reader = PdfReader(file)
37
- for page in pdf_reader.pages:
38
- text = page.extract_text()
39
- if text:
40
- all_text += text
41
-
42
- # Split text into chunks
43
- text_splitter = CharacterTextSplitter(
44
- separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
45
- )
46
- chunks = text_splitter.split_text(all_text)
47
-
48
- # Create embeddings
49
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
50
- vectorstore = FAISS.from_texts(chunks, embeddings)
51
-
52
- # Load HuggingFace model (Seq2Seq for QA)
53
- model_name = "google/flan-t5-small" # lightweight & fast
54
- tokenizer = AutoTokenizer.from_pretrained(model_name)
55
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
56
- pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
57
- llm = HuggingFacePipeline(pipeline=pipe)
58
-
59
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
60
- st.session_state.conversation = ConversationalRetrievalChain.from_llm(
61
- llm=llm, retriever=vectorstore.as_retriever(), memory=memory
62
- )
63
-
64
- st.session_state.processComplete = True
65
- st.success("✅ PDFs processed successfully!")
66
-
67
- # ----------------- Chat Section -----------------
68
- if st.session_state.processComplete and st.session_state.conversation:
69
- user_question = st.text_input("Ask a question about your PDFs:")
70
-
71
- if user_question:
72
- response = st.session_state.conversation({"question": user_question})
73
- answer = response["answer"]
74
- st.session_state.chat_history.append((user_question, answer))
75
-
76
- # Display chat
77
- for i, (q, a) in enumerate(st.session_state.chat_history):
78
- message(q, is_user=True, key=f"user_{i}")
79
- message(a, is_user=False, key=f"bot_{i}")
80
-
81
- if __name__ == "__main__":
82
- main()
 
1
+ import gradio as gr
 
2
  from PyPDF2 import PdfReader
3
  from langchain.text_splitter import CharacterTextSplitter
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.vectorstores import FAISS
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.memory import ConversationBufferMemory
 
8
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
9
+ from langchain.llms import HuggingFacePipeline
10
 
11
+ # -----------------------
12
+ # Load LLM model locally
13
+ # -----------------------
14
+ model_name = "google/flan-t5-small" # keep small for Spaces, you can change
15
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
16
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
17
 
18
+ pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
19
+ llm = HuggingFacePipeline(pipeline=pipe)
20
 
21
+ # Global variables
22
+ db = None
23
+ conversation = None
24
 
 
 
 
 
25
 
26
# -----------------------
# Step 1: Process PDFs
# -----------------------
def process_pdfs(files):
    """Extract text from uploaded PDFs, index it, and build the chat chain.

    Args:
        files: list of Gradio file objects (each exposes a temp-file path
            via ``.name``), or None when nothing was uploaded.

    Returns:
        A status string for the UI.

    Side effects:
        Assigns the module-level ``db`` (FAISS index) and ``conversation``
        (ConversationalRetrievalChain) globals on success.
    """
    global db, conversation

    # Gradio passes None when the upload widget is empty; the original
    # crashed with a TypeError here.
    if not files:
        return "⚠️ Please upload at least one PDF first."

    text = ""
    for file in files:
        pdf = PdfReader(file.name)  # gr.File exposes a temp-file path via .name
        for page in pdf.pages:
            # extract_text() can return None for image-only/scanned pages
            text += page.extract_text() or ""

    # FAISS.from_texts fails on an empty chunk list; report instead of crashing.
    if not text.strip():
        return "⚠️ No extractable text found in the uploaded PDFs."

    # Split text into overlapping chunks for retrieval
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)

    # Embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Vector DB
    db = FAISS.from_texts(chunks, embeddings)

    # Memory key must match the chain's expected "chat_history" input
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Conversation Chain
    conversation = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=db.as_retriever(),
        memory=memory,
    )

    return "✅ PDFs processed! You can now start chatting."
59
+
60
+
61
# -----------------------
# Step 2: Chat Function
# -----------------------
def chat(user_input):
    """Answer a question against the indexed PDFs.

    Args:
        user_input: the user's question as a string.

    Returns:
        (answer, history) where ``answer`` is the model's reply and
        ``history`` is a list of (user_message, bot_message) string tuples
        that a gr.Chatbot component can actually render.
    """
    global conversation, db

    if conversation is None or db is None:
        return "⚠️ Please upload and process PDFs first.", []

    result = conversation({"question": user_input})
    answer = result["answer"]

    # BUG FIX: the original returned result["chat_history"] directly — a list
    # of LangChain message objects (alternating Human/AI) that gr.Chatbot
    # cannot display. Convert them to (user, bot) text pairs.
    messages = result.get("chat_history", [])
    pairs = [
        (messages[i].content, messages[i + 1].content)
        for i in range(0, len(messages) - 1, 2)
    ]
    return answer, pairs
74
+
75
+
76
# -----------------------
# Gradio UI
# -----------------------
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Multiple PDF Chatbot")

    with gr.Row():
        pdfs = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
        process_btn = gr.Button("Process PDFs")

    status = gr.Textbox(label="Status", interactive=False)

    chatbot = gr.Chatbot()
    user_msg = gr.Textbox(label="Ask a question about your PDFs")
    send_btn = gr.Button("Send")

    # Adapter: chat() returns (answer, ...); the Chatbot component needs the
    # full list of (user, bot) pairs, and the textbox should clear after send.
    # BUG FIX: the original wired chat directly with outputs=[chatbot, chatbot]
    # (same component listed twice) and fed a plain string to the Chatbot.
    def _respond(message, history):
        answer, _ = chat(message)
        return history + [(message, answer)], ""

    # Actions
    process_btn.click(process_pdfs, inputs=[pdfs], outputs=[status])
    send_btn.click(_respond, inputs=[user_msg, chatbot], outputs=[chatbot, user_msg])
    # Allow pressing Enter in the textbox as well as clicking Send.
    user_msg.submit(_respond, inputs=[user_msg, chatbot], outputs=[chatbot, user_msg])

demo.launch()