himanshukumar378 commited on
Commit
ccfc149
·
verified ·
1 Parent(s): 2d90360

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -84
app.py CHANGED
@@ -1,98 +1,87 @@
1
  import gradio as gr
2
  from PyPDF2 import PdfReader
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter
4
- from langchain_community.embeddings import HuggingFaceEmbeddings
5
- from langchain_community.vectorstores import FAISS
 
6
  from langchain_community.llms import HuggingFacePipeline
7
- from langchain.chains import ConversationalRetrievalChain
8
- from transformers import pipeline
9
 
10
- # -------------------------------
11
- # PDF TEXT LOADER
12
- # -------------------------------
13
- def load_pdfs(pdf_files):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  text = ""
15
- for pdf in pdf_files:
16
- pdf_reader = PdfReader(pdf.name) # use .name for gradio file objects
17
- for page in pdf_reader.pages:
18
- page_text = page.extract_text()
19
- if page_text:
20
- text += page_text
21
- return text
22
-
23
- # -------------------------------
24
- # BUILD VECTORSTORE
25
- # -------------------------------
26
- def build_vectorstore(text):
27
- splitter = RecursiveCharacterTextSplitter(
28
- chunk_size=1000,
29
- chunk_overlap=200,
30
- length_function=len
31
- )
32
  chunks = splitter.split_text(text)
33
 
34
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
35
- vectorstore = FAISS.from_texts(chunks, embeddings)
36
- return vectorstore
37
-
38
- # -------------------------------
39
- # SETUP QA CHAIN
40
- # -------------------------------
41
- def build_conversation_chain(vectorstore):
42
- llm_pipeline = pipeline(
43
- "text2text-generation",
44
- model="google/flan-t5-base", # lightweight, fast model
45
- tokenizer="google/flan-t5-base",
46
- max_new_tokens=256
47
- )
48
- llm = HuggingFacePipeline(pipeline=llm_pipeline)
49
-
50
- qa_chain = ConversationalRetrievalChain.from_llm(
51
- llm=llm,
52
- retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
53
- return_source_documents=False
54
- )
55
- return qa_chain
56
-
57
- # -------------------------------
58
- # GRADIO INTERFACE
59
- # -------------------------------
60
- conversation_chain = None
61
- chat_history = []
62
-
63
- def process_pdfs(pdf_files):
64
- global conversation_chain, chat_history
65
- chat_history = [] # reset history
66
- text = load_pdfs(pdf_files)
67
- vs = build_vectorstore(text)
68
- conversation_chain = build_conversation_chain(vs)
69
- return "✅ PDFs processed successfully. You can now ask questions!"
70
-
71
- def chat(message, history):
72
- global conversation_chain, chat_history
73
- if not conversation_chain:
74
- return "⚠️ Please upload and process PDFs first."
75
-
76
- response = conversation_chain({"question": message, "chat_history": chat_history})
77
- answer = response["answer"]
78
- chat_history.append((message, answer))
79
- return answer
80
 
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  with gr.Blocks() as demo:
83
- gr.Markdown("## 📚 Multiple PDF Chatbot")
84
-
85
- with gr.Row():
86
- pdf_input = gr.File(file_types=[".pdf"], file_types_display="PDF Files", file_types_visible=True, file_types_select_multiple=True, label="Upload PDFs", type="file", file_types_accept_multiple=True)
87
- process_btn = gr.Button("Process PDFs")
88
-
89
- output_status = gr.Textbox(label="Status", interactive=False)
90
 
91
- chatbot = gr.Chatbot()
92
- msg = gr.Textbox(label="Your Question")
93
- send_btn = gr.Button("Ask")
 
94
 
95
- process_btn.click(process_pdfs, inputs=pdf_input, outputs=output_status)
96
- send_btn.click(chat, inputs=[msg, chatbot], outputs=chatbot)
97
 
98
  demo.launch()
 
1
  import gradio as gr
2
  from PyPDF2 import PdfReader
3
+ from langchain.text_splitter import CharacterTextSplitter
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.vectorstores import FAISS
6
+ from langchain.chains.question_answering import load_qa_chain
7
  from langchain_community.llms import HuggingFacePipeline
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 
9
 
10
+
11
+ # ----------------------------
12
+ # Lazy load model & embeddings
13
+ # ----------------------------
14
def load_llm():
    """Build a LangChain LLM wrapper around a local flan-t5-base pipeline."""
    # Lightweight seq2seq model so it runs on a free Hugging Face Space.
    model_name = "google/flan-t5-base"
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
    hf_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    text2text = pipeline("text2text-generation", model=hf_model, tokenizer=hf_tokenizer)
    return HuggingFacePipeline(pipeline=text2text)
21
+
22
+
23
def load_embeddings():
    """Return the sentence-transformer embedding model used for the FAISS index."""
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=embedding_model)
25
+
26
+
27
+ # ----------------------------
28
+ # Process PDF
29
+ # ----------------------------
30
def process_pdf(pdf_file):
    """Extract the text of an uploaded PDF and index it in a FAISS vector store.

    Parameters:
        pdf_file: Gradio file object; its ``.name`` attribute is the path on disk.

    Returns:
        tuple: ``(knowledge_base, message)`` — a FAISS index plus a success
        message, or ``(None, error message)`` when no text could be extracted.
    """
    pdf_reader = PdfReader(pdf_file.name)
    # "".join avoids quadratic string concatenation on large PDFs;
    # extract_text() may return None for image-only pages, hence `or ""`.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    if not text.strip():
        return None, "❌ No extractable text found in PDF!"

    # Overlapping chunks so an answer spanning a chunk boundary is still retrievable.
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
    chunks = splitter.split_text(text)

    # Create embeddings + FAISS index
    embeddings = load_embeddings()
    knowledge_base = FAISS.from_texts(chunks, embeddings)

    return knowledge_base, "✅ PDF processed successfully!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
+ # ----------------------------
51
+ # Chat Function
52
+ # ----------------------------
53
def chat_with_pdf(pdf_file, query, history=None):
    """Answer a question about the uploaded PDF and extend the chat history.

    Parameters:
        pdf_file: Gradio file object, or None when nothing has been uploaded.
        query: the user's question string.
        history: list of [user, bot] message pairs (Gradio Chatbot state).

    Returns:
        The updated history list, suitable as the Chatbot component value.
    """
    # Bug fix: the original `history=[]` default is shared across calls, so
    # conversations would leak between sessions. Use a None sentinel instead.
    if history is None:
        history = []

    if pdf_file is None:
        return history + [["User: " + query, "⚠️ Please upload a PDF first!"]]

    # NOTE(review): the PDF is re-embedded on every question — simple, but
    # O(document) work per query; consider caching the index per file.
    knowledge_base, msg = process_pdf(pdf_file)
    if knowledge_base is None:
        return history + [["System", msg]]

    # Run LLM QA Chain
    llm = load_llm()
    chain = load_qa_chain(llm, chain_type="stuff")

    docs = knowledge_base.similarity_search(query, k=3)
    answer = chain.run(input_documents=docs, question=query)

    history.append(["User: " + query, "Bot: " + answer])
    return history
71
+
72
+
73
+ # ----------------------------
74
+ # Gradio UI
75
+ # ----------------------------
76
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Multiple PDF Chatbot (LangChain + Hugging Face)")

    # Upload + question side by side; the conversation view sits below.
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
        query = gr.Textbox(label="Ask a question about the PDF")
    chatbot = gr.Chatbot(label="Conversation")

    btn = gr.Button("Ask")
    # Chatbot state is threaded back in as the history argument.
    btn.click(fn=chat_with_pdf, inputs=[pdf_file, query, chatbot], outputs=chatbot)

demo.launch()