IW2025 committed on
Commit
764f397
·
verified ·
1 Parent(s): 32ddee3

Upload llm_app.py

Browse files
Files changed (1) hide show
  1. llm_app.py +361 -0
llm_app.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from pathlib import Path
4
+ import fitz # PyMuPDF
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import Chroma
7
+ from langchain.prompts import PromptTemplate
8
+ from langchain.chains import LLMChain
9
+ import requests
10
+ import json
11
+ import base64
12
+ from PIL import Image
13
+ import io
14
+ import re
15
+ from dotenv import load_dotenv
16
+
17
+ # Load environment variables from .env file
18
+ load_dotenv()
19
+
20
+ # --- LLM-Powered Curriculum Assistant ---
21
+
22
class LLMCurriculumAssistant:
    """Answer student questions from a folder of PDF slide decks.

    Pipeline, as implemented below:
      1. Extract the text of every PDF page and index each page as one chunk
         in a Chroma vector store.
      2. For a query, fetch the 5 nearest pages, optionally ask the DeepSeek
         chat API to pick the best one, then ask it to explain that page.
      3. Render the chosen page and its neighbours as images for display.

    When no ``DEEPSEEK_API_KEY`` is configured (or an API call fails) the
    assistant falls back to the top vector-search hit and a canned answer
    that quotes the slide text.
    """

    # Seconds to wait on a DeepSeek HTTP call; without a timeout ``requests``
    # can block forever and freeze the UI handler.
    REQUEST_TIMEOUT = 60
    # Number of neighbouring pages shown on each side of the selected slide.
    NEIGHBOR_RADIUS = 2

    _NO_MATCH_MESSAGE = "I couldn't find any relevant content in the curriculum for your question."
    _EMPTY_QUERY_MESSAGE = "Please enter a question about the curriculum."

    _CONTENT_SELECTION_TEMPLATE = """Hi! I'm helping a student find the best curriculum slide for their question.

The student asked: "{question}"

Here are some slides that might be relevant:
{slide_contents}

Could you help me pick the slide that best answers their specific question? Look for:
- Slides that specifically mention what they're asking about
- Slides with clear explanations and examples
- Slides that match the exact terms they used (like "for loops" vs just "loops")

Just respond with the slide number (1, 2, 3, etc.) that you think is most helpful. If none really fit, say "0".

Thanks! Slide number:"""

    _ANSWER_TEMPLATE = """Hey there! I'm helping a student understand a programming concept. They asked:

"{question}"

Here's what the curriculum slide says about it:
{slide_content}

Could you help me explain this to them in a friendly, educational way? I'd like you to:
- Break it down in simple terms
- Use examples if the slide has them
- Make it step-by-step and easy to follow
- Add some helpful context if the slide is brief
- Use bullet points or lists to make it clear
- Make sure your answer directly addresses what they asked

Thanks for your help! Here's what I'd tell the student:"""

    def __init__(self, slides_dir="Slides"):
        """Index every ``*.pdf`` in *slides_dir* and prepare the LLM prompts.

        Args:
            slides_dir: Directory that is globbed for ``*.pdf`` files.
        """
        self.pdf_pages = {}  # {filename: {page_num: text}}
        self.pdf_files = {}  # {filename: path}
        self.chunks = []
        self.chunk_metadata = []
        self.vector_db = None
        self.embeddings = None
        self.llm = None
        self.content_selection_chain = None
        self.answer_chain = None

        # Defaults so the attributes exist even if _setup_llm fails early.
        self.deepseek_api_key = None
        self.deepseek_base_url = "https://api.deepseek.com/v1/chat/completions"
        self.content_selection_prompt = None
        self.answer_prompt = None

        # Setup
        self._process_pdfs(slides_dir)
        self._build_vector_db()
        self._setup_llm()

    def _process_pdfs(self, slides_dir):
        """Extract per-page text from each PDF and record one chunk per page.

        Pages whose extracted text is empty after stripping are skipped, so
        page numbers in ``self.pdf_pages`` may have gaps. Page numbers are
        1-based.
        """
        slides_path = Path(slides_dir)
        # glob order is unspecified; sort so chunk order is reproducible.
        pdf_files = sorted(slides_path.glob("*.pdf"))

        for pdf_file in pdf_files:
            self.pdf_files[pdf_file.name] = str(pdf_file)
            doc = fitz.open(str(pdf_file))
            try:
                pages = {}
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages[page_num] = text
            finally:
                # Release the file handle even if text extraction raises.
                doc.close()

            self.pdf_pages[pdf_file.name] = pages

            # Add each page as a chunk
            for page_num, text in pages.items():
                self.chunks.append(text)
                self.chunk_metadata.append({
                    "filename": pdf_file.name,
                    "page_number": page_num,
                })

        print(f"✅ Processed {len(pdf_files)} PDF files with {len(self.chunks)} total pages")

    def _build_vector_db(self):
        """Embed all page chunks into a Chroma store persisted at ./chroma_db.

        Leaves ``self.vector_db`` as ``None`` when there is nothing to index;
        ``chat`` treats that as "no relevant content".
        """
        if not self.chunks:
            print("⚠️ No slide text found; vector database was not built")
            return

        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        # Stable ids keyed on file + page: because the store is persisted,
        # auto-generated ids would insert a fresh copy of every page on each
        # start-up and fill the top-k results with duplicates.
        # NOTE(review): relies on the LangChain Chroma wrapper upserting by
        # id - confirm against the installed version. Entries for PDFs that
        # were removed from the folder are not purged here.
        chunk_ids = [
            f"{meta['filename']}::page-{meta['page_number']}"
            for meta in self.chunk_metadata
        ]
        self.vector_db = Chroma.from_texts(
            texts=self.chunks,
            embedding=self.embeddings,
            metadatas=self.chunk_metadata,
            ids=chunk_ids,
            persist_directory="./chroma_db",
        )
        print("✅ Vector database built successfully")

    def _setup_llm(self):
        """Read the DeepSeek API key and build the two prompt templates.

        On any error the key and both prompts are reset to ``None`` so that
        ``chat`` uses its non-LLM fallbacks.
        """
        try:
            # Initialize DeepSeek client
            self.deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
            self.deepseek_base_url = "https://api.deepseek.com/v1/chat/completions"

            # Create content selection prompt
            self.content_selection_prompt = PromptTemplate(
                input_variables=["question", "slide_contents"],
                template=self._CONTENT_SELECTION_TEMPLATE,
            )

            # Create answer generation prompt
            self.answer_prompt = PromptTemplate(
                input_variables=["question", "slide_content"],
                template=self._ANSWER_TEMPLATE,
            )

            if self.deepseek_api_key:
                print("✅ LLM setup successful!")
            else:
                # Previously reported success even though every LLM step
                # would silently be skipped.
                print("⚠️ DEEPSEEK_API_KEY is not set; using vector search results without the LLM")

        except Exception as e:
            print(f"❌ Error setting up LLM: {e}")
            self.deepseek_api_key = None
            self.content_selection_prompt = None
            self.answer_prompt = None

    def _call_deepseek(self, prompt, max_tokens=1500, temperature=0.7):
        """POST a single-turn chat completion and return the reply text.

        Raises whatever ``requests`` raises (timeout, HTTP error status) and
        ``KeyError``/``IndexError`` if the response JSON lacks the expected
        ``choices[0].message.content`` path; callers handle these.
        """
        headers = {
            "Authorization": f"Bearer {self.deepseek_api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": "deepseek-chat",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        response = requests.post(
            self.deepseek_base_url,
            headers=headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]

    @staticmethod
    def _parse_selection(response_text):
        """Return the first integer found in *response_text*, or ``None``."""
        numbers = re.findall(r'\d+', response_text or "")
        return int(numbers[0]) if numbers else None

    def _page_count(self, pdf_path):
        """Return the number of pages in the PDF at *pdf_path*."""
        doc = fitz.open(pdf_path)
        try:
            return len(doc)
        finally:
            doc.close()

    def get_pdf_page_image(self, pdf_path, page_num):
        """Render one PDF page to an RGB PIL image.

        Args:
            pdf_path: Path of the PDF file.
            page_num: 1-based page number.

        Returns:
            The rendered image, or ``None`` when *page_num* is outside
            ``1..page_count`` or rendering fails (the error is printed).
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                # Lower bound matters: page_num == 0 would index -1 and
                # silently render the last page instead.
                if not 1 <= page_num <= len(doc):
                    return None
                page = doc[page_num - 1]
                pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                return img
            finally:
                # Close on every path, including rendering errors.
                doc.close()
        except Exception as e:
            print(f"Error rendering PDF page: {str(e)}")
            return None

    def _select_result(self, query, results):
        """Pick the search hit to answer from, asking the LLM when available.

        Falls back to ``results[0]`` when the LLM is not configured, the call
        fails, or its reply does not name a slide in ``1..len(results)``
        (a reply of "0" therefore also yields the first hit).
        """
        fallback = results[0]
        if not (self.deepseek_api_key and self.content_selection_prompt):
            return fallback

        try:
            # Prepare slide contents for LLM analysis (800 chars per slide)
            slide_contents_text = "\n\n".join(
                f"Slide {position} ({hit.metadata['filename']} - Page {hit.metadata['page_number']}):\n{hit.page_content[:800]}"
                for position, hit in enumerate(results, start=1)
            )

            print("🤖 Using DeepSeek to select most relevant content...")
            prompt = self.content_selection_prompt.format(
                question=query,
                slide_contents=slide_contents_text,
            )
            selection_response = self._call_deepseek(prompt)
            print(f"DeepSeek Selection Response: {selection_response}")
        except Exception as e:
            print(f"Error in LLM content selection: {e}")
            return fallback

        choice = self._parse_selection(selection_response)
        if choice is None:
            print("⚠️ No number found in LLM response, using first result")
            return fallback
        if 1 <= choice <= len(results):
            print(f"✅ LLM selected slide {choice}")
            return results[choice - 1]
        print(f"⚠️ LLM selection out of range: {choice}")
        return fallback

    def _generate_answer(self, query, selected_content):
        """Return the LLM explanation of the slide, or a canned fallback."""
        fallback = f"Based on the curriculum slide:\n\n{selected_content}\n\nThis slide contains relevant information about your question."
        if not (self.deepseek_api_key and self.answer_prompt and selected_content):
            return fallback

        try:
            print("🤖 Generating DeepSeek answer...")
            prompt = self.answer_prompt.format(
                question=query,
                slide_content=selected_content,
            )
            answer = (self._call_deepseek(prompt) or "").strip()
        except Exception as e:
            print(f"Error generating DeepSeek answer: {e}")
            return fallback

        if not answer:
            # An empty completion would render as a blank answer panel.
            return fallback
        print(f"✅ DeepSeek answer generated: {answer[:100]}...")
        return answer

    def _collect_slides(self, selected_result):
        """Render the selected page plus neighbours for the gallery.

        Returns:
            ``(relevant_slides, recommended_slide, recommended_label)`` where
            ``relevant_slides`` is a list of ``(image, label)`` tuples in page
            order and the recommended pair is the selected page itself (or
            the first rendered slide if the selected page failed to render).
        """
        filename = selected_result.metadata["filename"]
        page_number = selected_result.metadata["page_number"]

        pdf_path = self.pdf_files.get(filename)
        if pdf_path is None:
            return [], None, None

        try:
            total_pages = self._page_count(pdf_path)
        except Exception as e:
            # The text answer is still useful without images.
            print(f"Error opening PDF for slide display: {e}")
            return [], None, None

        # Get the selected page and neighboring pages
        start_page = max(1, page_number - self.NEIGHBOR_RADIUS)
        end_page = min(total_pages, page_number + self.NEIGHBOR_RADIUS)

        relevant_slides = []
        recommended_slide = None
        recommended_label = None
        for page_num in range(start_page, end_page + 1):
            img = self.get_pdf_page_image(pdf_path, page_num)
            if img is None:
                continue
            if page_num == page_number:
                label = f"📌 {filename} - Page {page_num} (Most Relevant)"
                # Previously the first neighbour was reported as recommended.
                recommended_slide, recommended_label = img, label
            else:
                label = f"{filename} - Page {page_num}"
            relevant_slides.append((img, label))

        if recommended_slide is None and relevant_slides:
            recommended_slide, recommended_label = relevant_slides[0]
        return relevant_slides, recommended_slide, recommended_label

    def chat(self, query):
        """Answer *query* from the indexed slides.

        Args:
            query: The student's question.

        Returns:
            A 4-tuple ``(answer, relevant_slides, recommended_slide,
            recommended_label)``; ``relevant_slides`` is a list of
            ``(image, label)`` tuples and the last two items are ``None``
            when no slide image could be produced.
        """
        print(f"\n🔍 Processing query: {query}")

        if not query or not query.strip():
            return self._EMPTY_QUERY_MESSAGE, [], None, None
        if self.vector_db is None:
            return self._NO_MATCH_MESSAGE, [], None, None

        # Step 1: Vector search to find relevant content
        results = self.vector_db.similarity_search(query, k=5)
        if not results:
            return self._NO_MATCH_MESSAGE, [], None, None

        print(f"📚 Found {len(results)} relevant slides from vector search")

        # Step 2: LLM content selection
        selected_result = self._select_result(query, results)

        # Step 3: LLM answer generation
        answer = self._generate_answer(query, selected_result.page_content)

        # Step 4: Get relevant slides for display
        relevant_slides, recommended_slide, recommended_label = self._collect_slides(selected_result)

        return answer, relevant_slides, recommended_slide, recommended_label
318
+
319
# --- Gradio UI ---
# Module-level singleton shared by every request handler. Constructing it
# runs the PDF processing, vector-DB build and LLM setup at import time.
assistant = LLMCurriculumAssistant()
321
+
322
def gradio_chat(query):
    """Gradio callback: return the answer text and the slide gallery items.

    ``assistant.chat`` yields four values; the recommended slide and its
    label are not shown in the UI, so only the first two are passed on.
    """
    reply_text, gallery_items, _recommended_image, _recommended_caption = assistant.chat(query)
    return reply_text, gallery_items
326
+
327
# Two-column layout: question box and generated answer on the left, the
# gallery of rendered slides on the right.
with gr.Blocks(title="LLM Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 LLM Curriculum Assistant\nYour AI programming tutor with LLM-powered content selection and answers!")

    with gr.Row():
        # Left Column - Chatbot Interface
        with gr.Column(scale=1):
            gr.Markdown("### 💬 Chatbot")
            gr.Markdown("**Ask questions about programming concepts:**")

            question = gr.Textbox(
                label="Question Input",
                placeholder="e.g., What are for loops? How do variables work? Explain functions...",
                lines=3,
            )
            submit = gr.Button("🤖 Ask AI", variant="primary", size="lg")
            answer = gr.Markdown(label="LLM Generated Answer")

        # Right Column - Slides Display
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Most Relevant Slides")
            gallery = gr.Gallery(
                label="Curriculum Slides",
                columns=1,
                rows=3,
                height="600px",
                object_fit="contain",
                show_label=False,
            )

    # Event handlers: the button click and pressing Enter in the textbox
    # share one wiring so they cannot drift apart.
    chat_wiring = {"fn": gradio_chat, "inputs": [question], "outputs": [answer, gallery]}
    submit.click(**chat_wiring)
    question.submit(**chat_wiring)

if __name__ == "__main__":
    demo.launch()