manabb commited on
Commit
bbe6774
·
verified ·
1 Parent(s): e4e5c5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -80
app.py CHANGED
@@ -1,87 +1,186 @@
1
  # app.py
2
- import os
3
  import gradio as gr
4
-
5
- from langchain.vectorstores import FAISS
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
- from langchain.document_loaders import TextLoader
8
- from langchain_text_splitters import RecursiveCharacterTextSplitter
9
  from langchain.chains import RetrievalQA
10
- from langchain.llms import HuggingFacePipeline
11
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
12
-
13
- # Optional: Set HF Token if needed
14
- # os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'hf_XXXX'
15
-
16
- # Initialize embedding model
17
- embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
18
-
19
- # Load HF model (lightweight for CPU)
20
- model_name = "google/flan-t5-small"
21
- tokenizer = AutoTokenizer.from_pretrained(model_name)
22
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
23
-
24
- # Wrap in pipeline
25
- pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
26
- llm = HuggingFacePipeline(pipeline=pipe)
27
-
28
- def process_file(file_path):
29
- # Load & split document
30
- loader = TextLoader(file_path)
31
- documents = loader.load()
32
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
33
- docs = text_splitter.split_documents(documents)
34
-
35
- # Create vector DB
36
- vector_db = FAISS.from_documents(docs, embedding_model)
37
- retriever = vector_db.as_retriever()
38
-
39
- # Setup RetrievalQA chain
40
- qa_chain = RetrievalQA.from_chain_type(
41
- llm=llm,
42
- chain_type="stuff",
43
- retriever=retriever
44
- )
45
-
46
- return qa_chain
47
-
48
- # Store the QA chain globally (across UI events)
49
- qa_chain = None
50
-
51
- def upload_and_prepare(file):
52
- global qa_chain
53
- # qa_chain = process_file(file)
54
- qa_chain = process_file(file.name)
55
- return "✅ Document processed. You can now ask questions!"
56
-
57
- def ask_question(query):
58
- if not qa_chain:
59
- return " Please upload a document first."
60
- response = qa_chain.invoke({"query": query})
61
- return response["result"]
62
-
63
- # Gradio UI
64
- with gr.Blocks() as demo:
65
- gr.Markdown("## 🧠 Ask Questions About Your Document (LangChain + Hugging Face)")
66
-
67
- with gr.Row():
68
- file_input = gr.File(label="📄 Upload .txt File", type="filepath")
69
- upload_btn = gr.Button("🔄 Process Document")
70
-
71
- upload_output = gr.Textbox(label="📁 Status", interactive=False)
72
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  with gr.Row():
74
- query_input = gr.Textbox(label="❓ Your Question")
75
- query_btn = gr.Button("🧠 Get Answer")
76
-
77
- answer_output = gr.Textbox(label=" Answer", lines=4)
78
-
79
- upload_btn.click(upload_and_prepare, inputs=file_input, outputs=upload_output)
80
- query_btn.click(ask_question, inputs=query_input, outputs=answer_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- # For local dev use: demo.launch()
83
- # For HF Spaces
84
  if __name__ == "__main__":
85
- demo.launch()
86
-
87
-
 
1
  # app.py
 
2
  import gradio as gr
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.llms import HuggingFaceHub
8
  from langchain.chains import RetrievalQA
9
+ from langchain.prompts import PromptTemplate
10
+ import os
11
+ import tempfile
12
+ import datetime
13
+
14
class PDFChatbotWithGradio:
    """RAG chatbot over a single uploaded PDF.

    Pipeline: load PDF -> split into overlapping chunks -> embed into a
    FAISS vector store -> answer questions with a RetrievalQA chain backed
    by a Hugging Face Hub model, citing source pages in each answer.
    """

    def __init__(self):
        # Populated lazily by process_pdf() / setup_qa_chain().
        self.vectorstore = None
        self.qa_chain = None
        # Sentence-transformer embeddings used to index PDF chunks.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        # Guard flag: questions are refused until a PDF has been processed.
        self.is_ready = False

    def process_pdf(self, file_obj):
        """Process an uploaded PDF file (handles the Gradio File object).

        Returns a human-readable status string for the status textbox;
        errors are reported as strings rather than raised, so the UI
        never crashes on a bad upload.
        """
        try:
            if file_obj is None:
                return "Please select a PDF file first!"

            # Gradio's File component returns an object (NamedString) whose
            # 'name' attribute is the path of the uploaded temp file.
            file_path = file_obj.name

            loader = PyPDFLoader(file_path)
            documents = loader.load()

            if not documents:
                return "No content could be extracted from the PDF."

            # Overlapping chunks keep answers coherent across chunk edges.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200
            )
            chunks = text_splitter.split_documents(documents)

            # Build the similarity index and the QA chain on top of it.
            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
            self.setup_qa_chain()

            self.is_ready = True
            return f"PDF processed successfully! Loaded {len(documents)} pages and created {len(chunks)} chunks."

        except Exception as e:
            return f"Error processing PDF: {str(e)}"

    def setup_qa_chain(self):
        """Set up the question-answering chain over the current vectorstore."""
        # NOTE(review): HuggingFaceHub requires HUGGINGFACEHUB_API_TOKEN in
        # the environment — confirm it is configured on the Space.
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-small",
            model_kwargs={"temperature": 0.1, "max_length": 512}
        )

        # Custom prompt keeps the model grounded in retrieved context.
        prompt_template = """You are a helpful assistant that answers questions based on the provided context.

Context: {context}

Question: {question}

Please provide a clear and concise answer based on the context above.
If the answer cannot be found in the context, say "I don't know based on the document."

Answer: """

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            # k=3: answer from the three most similar chunks.
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

    def ask_question(self, question, history):
        """Answer *question* and append the (question, answer) pair to *history*.

        Returns (textbox_value, history): the first element clears the input
        textbox on success. FIX: the not-ready message is now appended to the
        chat history (matching the error path) instead of being returned as
        the first element, where Gradio would overwrite the user's typed
        question in the input textbox.
        """
        if not self.is_ready:
            history.append((question, "Please upload and process a PDF first!"))
            return "", history

        if not question.strip():
            return "", history

        try:
            # .invoke() replaces the deprecated Chain.__call__ form.
            result = self.qa_chain.invoke({"query": question})
            answer = result["result"]

            # Append up to two source citations with page + preview.
            response = f"{answer}\n\n**Sources:**"
            for i, doc in enumerate(result["source_documents"][:2]):
                page_num = doc.metadata.get('page', 'N/A')
                if isinstance(page_num, int):
                    page_num += 1  # Convert to 1-indexed for user readability
                content_preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
                response += f"\n{i+1}. Page {page_num}: {content_preview}"

            history.append((question, response))
            return "", history

        except Exception as e:
            error_msg = f"Error: {str(e)}"
            history.append((question, error_msg))
            return "", history
125
# Single chatbot instance shared by all Gradio event handlers.
chatbot = PDFChatbotWithGradio()

# ----- Gradio interface -----
with gr.Blocks(title="PDF Chatbot Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Chatbot Agent")
    gr.Markdown("Upload a PDF document and ask questions about its content!")

    with gr.Row():
        # Left column: upload + processing controls.
        with gr.Column(scale=1):
            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
            upload_status = gr.Textbox(label="Upload Status", interactive=False)
            process_btn = gr.Button("Process PDF", variant="primary")

        # Right column: the chat itself.
        with gr.Column(scale=2):
            chatbot_interface = gr.Chatbot(label="Chat", height=400)
            question_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the PDF...")
            with gr.Row():
                submit_btn = gr.Button("Ask Question")
                clear_btn = gr.Button("Clear Chat")

    # Event wiring. The bound method is passed directly — the former
    # ask_and_clear wrapper only delegated to chatbot.ask_question.
    process_btn.click(
        fn=chatbot.process_pdf,
        inputs=pdf_upload,
        outputs=upload_status,
    )

    # Both the button click and pressing Enter submit a question.
    for trigger in (submit_btn.click, question_input.submit):
        trigger(
            fn=chatbot.ask_question,
            inputs=[question_input, chatbot_interface],
            outputs=[question_input, chatbot_interface],
        )

    clear_btn.click(fn=lambda: [], inputs=[], outputs=chatbot_interface)

    gr.Examples(
        examples=[
            "What is the main topic of this document?",
            "Can you summarize the key points?",
            "What are the main conclusions?",
            "List the important findings mentioned.",
        ],
        inputs=question_input,
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(share=True)