manabb committed on
Commit
11bfceb
·
verified ·
1 Parent(s): 3b7db7f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -179
app.py CHANGED
@@ -1,186 +1,90 @@
 
1
  # app.py
 
2
  import gradio as gr
3
- from langchain.document_loaders import PyPDFLoader
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.vectorstores import FAISS
7
- from langchain.llms import HuggingFaceHub
 
 
8
  from langchain.chains import RetrievalQA
9
- from langchain.prompts import PromptTemplate
10
- import os
11
- import tempfile
12
- import datetime
13
-
14
class PDFChatbotWithGradio:
    """PDF question-answering bot for a Gradio UI.

    Indexes an uploaded PDF into a FAISS vector store and answers questions
    over it with a HuggingFace-hosted LLM via a RetrievalQA chain.
    """

    def __init__(self):
        self.vectorstore = None   # FAISS index, built in process_pdf()
        self.qa_chain = None      # RetrievalQA chain, built in setup_qa_chain()
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.is_ready = False     # True once a PDF has been processed

    def process_pdf(self, file_obj):
        """Index the uploaded PDF and prepare the QA chain.

        file_obj: the Gradio upload value — either a tempfile-like object
        exposing the path via ``.name`` or a plain filepath string,
        depending on the gr.File configuration.
        Returns a human-readable status string for the UI.
        """
        try:
            if file_obj is None:
                return "Please select a PDF file first!"

            # FIX: gr.File(type="filepath") hands callbacks a plain string,
            # while other configurations hand an object with .name — accept both.
            file_path = file_obj if isinstance(file_obj, str) else file_obj.name

            loader = PyPDFLoader(file_path)
            documents = loader.load()

            if not documents:
                return "No content could be extracted from the PDF."

            # Split into overlapping chunks so retrieval granularity is finer
            # than whole pages.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200
            )
            chunks = text_splitter.split_documents(documents)

            # Build the vector index and the QA chain over it.
            self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
            self.setup_qa_chain()

            self.is_ready = True
            return f"PDF processed successfully! Loaded {len(documents)} pages and created {len(chunks)} chunks."

        except Exception as e:
            # Surface the failure to the UI status box instead of crashing.
            return f"Error processing PDF: {str(e)}"

    def setup_qa_chain(self):
        """Build the RetrievalQA chain over the current vector store."""
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-small",
            model_kwargs={"temperature": 0.1, "max_length": 512}
        )

        # Prompt fed to the "stuff" chain with retrieved context injected.
        prompt_template = """You are a helpful assistant that answers questions based on the provided context.

Context: {context}

Question: {question}

Please provide a clear and concise answer based on the context above.
If the answer cannot be found in the context, say "I don't know based on the document."

Answer: """

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

    def ask_question(self, question, history):
        """Answer *question* and append the exchange to *history*.

        Returns ``(textbox_value, history)``: the first value clears the
        question textbox on success; the "not ready" prompt is returned
        in-band so the UI shows it in the textbox.
        """
        if not self.is_ready:
            return "Please upload and process a PDF first!", history

        if not question.strip():
            return "", history

        try:
            result = self.qa_chain({"query": question})
            answer = result["result"]

            # Append up to two source snippets so users can verify the answer.
            response = f"{answer}\n\n**Sources:**"
            for i, doc in enumerate(result["source_documents"][:2]):
                page_num = doc.metadata.get('page', 'N/A')
                if isinstance(page_num, int):
                    page_num += 1  # Convert to 1-indexed for user readability
                content_preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
                response += f"\n{i+1}. Page {page_num}: {content_preview}"

            history.append((question, response))
            return "", history

        except Exception as e:
            # Report errors as a chat message rather than breaking the UI.
            error_msg = f"Error: {str(e)}"
            history.append((question, error_msg))
            return "", history
124
# Single chatbot instance shared by every UI callback below.
chatbot = PDFChatbotWithGradio()

# Assemble the Gradio interface.
with gr.Blocks(title="PDF Chatbot Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Chatbot Agent")
    gr.Markdown("Upload a PDF document and ask questions about its content!")

    with gr.Row():
        with gr.Column(scale=1):
            # Left column: upload + processing controls.
            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
            upload_status = gr.Textbox(label="Upload Status", interactive=False)
            process_btn = gr.Button("Process PDF", variant="primary")

        with gr.Column(scale=2):
            # Right column: chat transcript and question entry.
            chatbot_interface = gr.Chatbot(label="Chat", height=400)
            question_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the PDF...")
            with gr.Row():
                submit_btn = gr.Button("Ask Question")
                clear_btn = gr.Button("Clear Chat")

    # --- event wiring -------------------------------------------------
    process_btn.click(
        fn=chatbot.process_pdf,
        inputs=pdf_upload,
        outputs=upload_status,
    )

    def ask_and_clear(question, history):
        # Delegate to the bot; it returns ("", history) so the textbox clears.
        return chatbot.ask_question(question, history)

    # Button click and textbox Enter share identical wiring.
    for trigger in (submit_btn.click, question_input.submit):
        trigger(
            fn=ask_and_clear,
            inputs=[question_input, chatbot_interface],
            outputs=[question_input, chatbot_interface],
        )

    # Reset the transcript to an empty list.
    clear_btn.click(
        fn=lambda: [],
        inputs=[],
        outputs=chatbot_interface,
    )

    gr.Examples(
        examples=[
            "What is the main topic of this document?",
            "Can you summarize the key points?",
            "What are the main conclusions?",
            "List the important findings mentioned.",
        ],
        inputs=question_input,
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(share=True)
 
 
 
1
# app.py
import os
import gradio as gr

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.document_loaders import PyPDFLoader

# Optional: Set HF Token if needed
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'hf_XXXX'

# Initialize embedding model (shared for indexing and query embedding).
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load HF model (lightweight for CPU)
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Wrap in pipeline; max_length caps the generated output length.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)
+
30
def process_file(file_path):
    """Index the PDF at *file_path* and return a RetrievalQA chain over it.

    Uses the module-level ``embedding_model`` and ``llm``; each call builds a
    fresh FAISS index, so a new upload fully replaces the previous document.
    """
    # Load the PDF and cut it into overlapping chunks for retrieval.
    pages = PyPDFLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(pages)

    # Embed the chunks into an in-memory FAISS vector store.
    index = FAISS.from_documents(chunks, embedding_model)

    # "stuff" chain type: retrieved chunks are packed into a single prompt.
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=index.as_retriever(),
    )
50
+
51
+ # Store the QA chain globally (across UI events)
52
+ qa_chain = None
53
+
54
+ def upload_and_prepare(file):
55
+ global qa_chain
56
+ # qa_chain = process_file(file)
57
+ qa_chain = process_file(file.name)
58
+ return "βœ… Document processed. You can now ask questions!"
59
+
60
+ def ask_question(query):
61
+ if not qa_chain:
62
+ return "❌ Please upload a document first."
63
+ response = qa_chain.invoke({"query": query})
64
+ return response["result"]
65
+
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Ask Questions About Your Document (LangChain + Hugging Face)")

    with gr.Row():
        # FIX: label said "Upload .txt File" but the handler parses PDFs
        # via PyPDFLoader — label now matches actual behavior.
        # type="filepath" makes Gradio pass the path as a plain string.
        file_input = gr.File(label="📄 Upload PDF File", type="filepath")
        upload_btn = gr.Button("🔄 Process Document")

    upload_output = gr.Textbox(label="📝 Status", interactive=False)

    with gr.Row():
        query_input = gr.Textbox(label="❓ Your Question")
        query_btn = gr.Button("🧠 Get Answer")

    answer_output = gr.Textbox(label="✅ Answer", lines=4)

    # Wire the buttons to the module-level handlers defined above.
    upload_btn.click(upload_and_prepare, inputs=file_input, outputs=upload_output)
    query_btn.click(ask_question, inputs=query_input, outputs=answer_output)

# For local dev use: demo.launch()
# For HF Spaces
if __name__ == "__main__":
    demo.launch()