superbsaeed committed on
Commit
d13d41c
·
verified ·
1 Parent(s): 51bb158

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -90
app.py CHANGED
@@ -1,114 +1,198 @@
1
  import os
2
  import gradio as gr
3
- from pypdf import PdfReader
4
- from groq import Groq
5
-
6
def extract_text_from_pdf(pdf_path):
    """Return the PDF's text with a ``[Page N]`` header before each page.

    Pages whose extracted text is empty or whitespace-only are left out.
    Returns ``None`` when no page yields any text.
    """
    sections = []
    for number, page in enumerate(PdfReader(pdf_path).pages, start=1):
        body = (page.extract_text() or "").strip()
        if body:
            sections.append(f"[Page {number}]\n{body}")
    if not sections:
        return None
    return "\n\n".join(sections)
14
-
15
def truncate_text(text, max_chars=6000):
    """Shorten *text* to about *max_chars* characters by cutting out the middle.

    Text that already fits is returned unchanged. Otherwise the first two
    thirds of the budget are kept from the start and the remaining third from
    the end, joined by a ``...[middle truncated]...`` marker (the marker is
    added on top of the budget). With the default budget this keeps 4000
    leading and 2000 trailing characters.

    Args:
        text: The string to shorten.
        max_chars: Character budget for the kept head and tail; negative
            values are treated as 0.

    Returns:
        The original or shortened string.
    """
    marker = "\n\n...[middle truncated]...\n\n"
    max_chars = max(max_chars, 0)
    if len(text) <= max_chars:
        return text
    # Derive the split from the budget instead of hard-coding 4000/2000, so a
    # non-default max_chars is actually honoured.
    head_len = max_chars * 2 // 3
    tail_len = max_chars - head_len
    # text[-0:] would return the whole string, so handle a zero-length tail.
    tail = text[-tail_len:] if tail_len else ""
    return text[:head_len] + marker + tail
19
-
20
def ask_groq(pdf_text, question, model, history):
    """Answer *question* about the PDF text through Groq's chat-completions API.

    Args:
        pdf_text: Full extracted PDF text; shortened with ``truncate_text``
            before it is sent.
        question: The user's current question.
        model: Groq model identifier, passed straight to the API.
        history: Earlier ``(question, answer)`` pairs of this conversation.

    Returns:
        The message content of the first completion choice.

    Raises:
        KeyError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    client = Groq(api_key=os.environ["GROQ_API_KEY"])
    safe_pdf_text = truncate_text(pdf_text)
    pdf_preamble = f"PDF Content:\n\n{safe_pdf_text}\n\n---\nQuestion: "

    messages = [{
        "role": "system",
        "content": (
            "You are a helpful assistant. Answer questions ONLY from the provided PDF. "
            "If the answer is not in the document, say so. Mention page numbers when possible."
        )
    }]

    # History stores bare questions, so the document has to be re-attached to
    # the first user turn of the conversation on every call; otherwise
    # follow-up questions reach the model without any PDF content.
    for turn, (user_msg, bot_msg) in enumerate(history):
        prefix = pdf_preamble if turn == 0 else ""
        messages.append({"role": "user", "content": prefix + user_msg})
        messages.append({"role": "assistant", "content": bot_msg})

    # With no history, the current question is the first user turn.
    prefix = "" if history else pdf_preamble
    messages.append({"role": "user", "content": prefix + question})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2,
        max_tokens=1024,
    )
    return response.choices[0].message.content
50
 
51
def load_pdf(pdf_file, state):
    """Extract text from the uploaded PDF and store it in the session state.

    Args:
        pdf_file: Value of the upload component (``None`` when nothing has
            been uploaded); handed to ``extract_text_from_pdf``.
        state: Session dict holding ``"pdf_text"`` and ``"history"``.

    Returns:
        A ``(state, status_message, question_box_update)`` tuple. The question
        box is made interactive only when text was extracted.
    """
    if pdf_file is None:
        return state, "⚠️ Please upload a PDF.", gr.update(interactive=False)

    # A corrupt or unreadable PDF raises inside the parser; report it in the
    # status box instead of letting the exception escape the event handler.
    try:
        pdf_text = extract_text_from_pdf(pdf_file)
    except Exception as e:
        return state, f"❌ Error: {e}", gr.update(interactive=False)

    if not pdf_text:
        return state, "❌ Could not extract text. Try a non-scanned PDF.", gr.update(interactive=False)

    state["pdf_text"] = pdf_text
    state["history"] = []  # a new document starts a new conversation
    pages = pdf_text.count("[Page ")  # one "[Page N]" header per extracted page
    words = len(pdf_text.split())
    return (
        state,
        f"✅ Loaded! {pages} page(s), ~{words:,} words.\nNow ask your questions!",
        gr.update(interactive=True),
    )
62
-
63
def ask_question(question, model, state, chat_history):
    """Handle one submitted question and return the updated chat.

    Blank questions are ignored. Without a loaded PDF a warning is shown;
    otherwise the question goes to ``ask_groq`` and any failure is displayed
    as the bot's reply. The last returned value (an empty string) clears the
    question box.
    """
    cleared_box = ""
    if question.strip() == "":
        return chat_history, state, cleared_box

    if state.get("pdf_text"):
        try:
            reply = ask_groq(state["pdf_text"], question, model, state["history"])
            # Only successful exchanges become model-side history.
            state["history"].append((question, reply))
        except Exception as err:
            reply = f"❌ Error: {err}"
    else:
        reply = "⚠️ Please load a PDF first."

    chat_history.append((question, reply))
    return chat_history, state, cleared_box
76
 
77
def clear_chat(state):
    """Forget the stored Q&A turns and return an empty chat display.

    The loaded PDF text in *state* is kept; only ``"history"`` is reset.
    """
    state.update(history=[])
    return [], state
80
 
81
# Groq model identifiers offered in the dropdown; the first one is the default.
MODELS = [
    "llama-3.3-70b-versatile",
    "llama-3.1-8b-instant",
    "llama3-70b-8192",
    "gemma2-9b-it",
]

# UI layout and event wiring. The garbled (mis-decoded) emoji and punctuation
# in the title, heading and labels are restored to the intended characters.
with gr.Blocks(title="PDF Q&A — Groq", theme=gr.themes.Soft()) as demo:

    # Per-session store: extracted PDF text plus the (question, answer) turns.
    state = gr.State({"pdf_text": "", "history": []})

    gr.Markdown("# 📄 PDF Q&A App\n### Upload a PDF → Ask questions → Get AI answers")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
            upload_btn = gr.Button("📤 Load PDF", variant="primary", size="lg")
            status = gr.Textbox(label="Status", value="Upload a PDF to begin...", lines=3, interactive=False)
            model_drop = gr.Dropdown(choices=MODELS, value=MODELS[0], label="🤖 Groq Model")

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation", height=420)
            with gr.Row():
                # Starts disabled; load_pdf enables it once a PDF is loaded.
                q_input = gr.Textbox(label="Your Question", placeholder="e.g. Summarize this document", lines=2, interactive=False, scale=4)
                with gr.Column(scale=1):
                    ask_btn = gr.Button("🔍 Ask", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

    upload_btn.click(load_pdf, [pdf_input, state], [state, status, q_input])
    # The button and the Enter key run the same handler.
    ask_btn.click(ask_question, [q_input, model_drop, state, chatbot], [chatbot, state, q_input])
    q_input.submit(ask_question, [q_input, model_drop, state, chatbot], [chatbot, state, q_input])
    clear_btn.click(clear_chat, [state], [chatbot, state])

demo.launch()  # No share=True needed on Hugging Face
 
1
  import os
2
  import gradio as gr
3
+ from langchain_community.document_loaders import PyPDFLoader
4
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain_groq import ChatGroq
8
+ from langchain_core.prompts import PromptTemplate
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from langchain_core.runnables import RunnableLambda
11
+
12
# ── Global state ──────────────────────────────────────────────
# NOTE(review): these are module-level, so presumably shared by every session
# this process serves (one user's PDF/key would replace another's) — confirm.
vectorstore = None    # FAISS index; assigned by process_pdf(), reset by clear_all()
qa_chain = None       # chain returned by build_chain(); assigned by process_pdf()
retrieved_docs = {}   # "docs" key holds the documents fetched for the latest question

# ── Embedding model (loaded once) ─────────────────────────────
# Created at import time and reused for every FAISS index that is built.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)
23
+
24
+ # ── Helpers ───────────────────────────────────────────────────
25
def format_docs(docs):
    """Join retrieved documents into one context string with page headers.

    Each document becomes ``[Page N]`` followed by its ``page_content``, where
    N is the 1-based form of the integer ``metadata["page"]``. Documents with
    no usable page number are labelled ``[Page ?]``.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a string).

    Returns:
        The formatted sections separated by blank lines; ``""`` for no docs.
    """
    def page_label(doc):
        page = doc.metadata.get("page")
        # Only do arithmetic on a real page index; the old '?' default was
        # itself fed into "+ 1" and raised TypeError.
        return page + 1 if isinstance(page, int) else "?"

    return "\n\n".join(
        f"[Page {page_label(doc)}]\n{doc.page_content}"
        for doc in docs
    )
30
+
31
def build_chain(groq_api_key: str, model: str = "llama-3.3-70b-versatile", top_k: int = 4):
    """Build the LCEL RAG chain with the provided Groq key.

    The chain retrieves the ``top_k`` most similar chunks from the module-level
    ``vectorstore``, formats them with ``format_docs``, fills the prompt, calls
    the Groq chat model and returns the answer as a string. The retrieved
    documents are also stored in ``retrieved_docs["docs"]`` so the caller can
    cite their pages.

    Args:
        groq_api_key: API key passed to ``ChatGroq``.
        model: Groq model identifier.
        top_k: Number of chunks to retrieve per question.

    Returns:
        A runnable that is invoked with ``{"question": <str>}``.

    Raises:
        RuntimeError: If no vector store has been built yet.
    """
    if vectorstore is None:
        # Without this check the failure is an AttributeError on None.
        raise RuntimeError("No document index available; process a PDF first.")

    llm = ChatGroq(
        api_key=groq_api_key,
        model=model,
        temperature=0.2,
        max_tokens=1024,
    )

    prompt = PromptTemplate(
        template="""You are a helpful assistant. Use the context below to answer the question.
If the answer is not in the context, say "I don't have enough information to answer that."

Context:
{context}

Question: {question}

Answer:""",
        input_variables=["context", "question"]
    )

    # The retriever is bound to the index that exists right now; a new chain
    # must be built after the index is replaced.
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": top_k}
    )

    def retrieve_and_format(input_dict):
        question = input_dict["question"]
        docs = retriever.invoke(question)
        # Side channel read by answer_question() to list source pages.
        retrieved_docs["docs"] = docs
        return {
            "context": format_docs(docs),
            "question": question
        }

    chain = (
        RunnableLambda(retrieve_and_format)
        | prompt
        | llm
        | StrOutputParser()
    )
    return chain
74
+
75
+ # ── Core functions ────────────────────────────────────────────
76
def process_pdf(pdf_file, groq_api_key, progress=gr.Progress()):
    """Index the uploaded PDF and build the question-answering chain.

    Loads the PDF, splits it into overlapping chunks, embeds them into a FAISS
    index stored in the module-level ``vectorstore``, and stores the chain from
    ``build_chain`` in ``qa_chain``.

    Args:
        pdf_file: Value of the upload component: an object with a ``.name``
            path attribute or a plain path string; ``None`` if nothing was
            uploaded.
        groq_api_key: Groq API key typed by the user.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            what makes Gradio supply one, so it must remain the default.

    Returns:
        A Markdown status message describing success or the failure reason.
    """
    global vectorstore, qa_chain

    if pdf_file is None:
        return "⚠️ Please upload a PDF file."
    # Tolerate a missing value as well as a blank one.
    api_key = (groq_api_key or "").strip()
    if not api_key:
        return "⚠️ Please enter your Groq API key."

    # Accept both an upload object carrying .name and a bare file path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    previous_store = vectorstore

    try:
        progress(0.1, desc="Loading PDF...")
        pages = PyPDFLoader(pdf_path).load()

        progress(0.3, desc="Splitting into chunks...")
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=100,
            separators=["\n\n", "\n", ".", " "]
        )
        chunks = splitter.split_documents(pages)
        if not chunks:
            # Nothing to index (e.g. an image-only PDF); say so directly rather
            # than surfacing an opaque error from the index builder.
            return "❌ Could not extract text. Try a non-scanned PDF."

        progress(0.6, desc="Building FAISS index...")
        # build_chain() reads the global, so it must be assigned first.
        vectorstore = FAISS.from_documents(chunks, embeddings)

        progress(0.9, desc="Setting up RAG chain...")
        qa_chain = build_chain(api_key)

        progress(1.0, desc="Done!")
        return f"✅ Ready! Loaded **{len(pages)} pages** → **{len(chunks)} chunks**."

    except Exception as e:
        # Keep the previous index so it still matches the previous chain.
        vectorstore = previous_store
        return f"❌ Error: {str(e)}"
108
+
109
+
110
def answer_question(question, history):
    """Answer one chat question using the prepared RAG chain.

    Args:
        question: Text from the question box.
        history: List of ``(question, answer)`` pairs shown in the chatbot;
            ``None`` is treated as an empty list.

    Returns:
        ``("", history)``: the empty string clears the question box and
        *history* has the new exchange appended. Blank questions leave the
        history untouched. Failures of the chain are shown as the answer.
    """
    history = history if history is not None else []

    # Ignore blank input first, so it never adds a message to the chat,
    # whether or not a PDF has been processed.
    if not question or not question.strip():
        return "", history
    if vectorstore is None or qa_chain is None:
        history.append((question, "⚠️ Please upload a PDF and enter your Groq API key first."))
        return "", history

    try:
        answer = qa_chain.invoke({"question": question})

        # The chain records the documents it retrieved in retrieved_docs.
        docs = retrieved_docs.get("docs", [])
        # Cite only real page indices (shown 1-based); a missing page is
        # skipped instead of being reported as page 1.
        pages = sorted({
            doc.metadata["page"] + 1
            for doc in docs
            if isinstance(doc.metadata.get("page"), int)
        })
        if pages:
            page_list = ", ".join(str(page) for page in pages)
            answer += f"\n\n📄 *Sources: pages {page_list}*"

    except Exception as e:
        answer = f"❌ Error generating answer: {str(e)}"

    history.append((question, answer))
    return "", history
 
133
 
 
 
 
 
 
 
134
 
135
def clear_all():
    """Drop the indexed document and chain, and reset the UI.

    Resets the module-level ``vectorstore``, ``qa_chain`` and
    ``retrieved_docs``.

    Returns:
        ``(chat_history, question_text, status_message)``: an empty chat, an
        empty question box and a status line.
    """
    global vectorstore, qa_chain, retrieved_docs
    vectorstore = None
    qa_chain = None
    retrieved_docs = {}
    return [], "", "🗑️ Cleared. Upload a new PDF to start again."
141
 
142
+ # ── Gradio UI ─────────────────────────────────────────────────
143
# UI layout and event wiring. The garbled (mis-decoded) emoji in headings and
# button labels are restored to the intended characters.
with gr.Blocks(title="PDF RAG Chatbot", theme=gr.themes.Soft()) as demo:

    gr.Markdown(
        "# 📚 PDF RAG Chatbot\n"
        "Upload a PDF, enter your Groq API key, then ask questions about the document."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Setup")
            groq_key_box = gr.Textbox(
                label="Groq API Key",
                placeholder="gsk_...",
                type="password"
            )
            pdf_upload = gr.File(
                label="Upload PDF",
                file_types=[".pdf"]
            )
            process_btn = gr.Button("📥 Process PDF", variant="primary")
            status_box = gr.Markdown("*Upload a PDF to begin.*")

        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat")
            chatbot = gr.Chatbot(height=480, bubble_full_width=False)
            with gr.Row():
                question_box = gr.Textbox(
                    placeholder="Ask a question about your PDF...",
                    show_label=False,
                    scale=4
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear Chat & Reset")

    # ── Event handlers ─────────────────────────────────────────
    process_btn.click(
        process_pdf,
        inputs=[pdf_upload, groq_key_box],
        outputs=[status_box]
    )
    # The Send button and the Enter key run the same handler.
    for send_event in (submit_btn.click, question_box.submit):
        send_event(
            answer_question,
            inputs=[question_box, chatbot],
            outputs=[question_box, chatbot]
        )
    clear_btn.click(
        clear_all,
        outputs=[chatbot, question_box, status_box]
    )

demo.launch()