IW2025 committed on
Commit
e5b03ae
·
verified ·
1 Parent(s): 9fc11fb

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]4[[:space:]]Lesson[[:space:]]2.pptx[[:space:]](1).pdf filter=lfs diff=lfs merge=lfs -text
37
+ Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]4[[:space:]]Lesson.pptx[[:space:]](2).pdf filter=lfs diff=lfs merge=lfs -text
38
+ Slides/Copy[[:space:]]of[[:space:]]week[[:space:]]5[[:space:]]lesson.pptx.pdf filter=lfs diff=lfs merge=lfs -text
39
+ Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]6[[:space:]]lesson.pptx[[:space:]](1).pdf filter=lfs diff=lfs merge=lfs -text
40
+ Slides/Copy[[:space:]]of[[:space:]]Week[[:space:]]7[[:space:]]lesson.pptx.pdf filter=lfs diff=lfs merge=lfs -text
41
+ Slides/Sreekar[[:space:]]-[[:space:]]week[[:space:]]5[[:space:]]lesson.pptx.pdf filter=lfs diff=lfs merge=lfs -text
Slides/.DS_Store ADDED
Binary file (6.15 kB). View file
 
Slides/Copy of Week 4 Lesson 2.pptx (1).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:599121f746db2f8e9da2e96d83122f02e940fa49830e3404d5359054672eddb2
3
+ size 245349
Slides/Copy of Week 4 Lesson.pptx (2).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a09f25bb816d3e73e84184e0aae715fd9b008d573a31ccc25769d696d1c1e21
3
+ size 307124
Slides/Copy of Week 6 lesson.pptx (1).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ee7535c27d3c649a8ad7fbd8e3e9b362c92c4c5f50f797b0a08d89e140789dc
3
+ size 689156
Slides/Copy of Week 7 lesson.pptx.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f296602886643fec267981b85b5d4ce0a54c8cfde56aca42b80a9fbbe87e6004
3
+ size 316333
Slides/Copy of week 5 lesson.pptx.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98a2e6632af2ddedf1efad7ec386c2ca8ea6161bfc1d63390eed22f6ca4a9943
3
+ size 338567
Slides/Sreekar - week 5 lesson.pptx.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc94f5cfa4d28eece6bbf077dd62dfc92878724413cbaf2e37aa102d931235d9
3
+ size 338571
app.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from pathlib import Path
4
+ import fitz # PyMuPDF
5
+ from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.chains import RetrievalQA
9
+ from langchain.llms import HuggingFacePipeline
10
+ from langchain.prompts import PromptTemplate
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
12
+ import torch
13
+ from typing import List, Dict, Any
14
+ import re
15
+
16
class CurriculumAssistant:
    """Question-answering helper over a folder of curriculum slide PDFs.

    The assistant extracts per-page text from PDFs, indexes the text in a
    Chroma vector store, and answers questions through a LangChain
    ``RetrievalQA`` chain backed by a local Hugging Face pipeline.
    """

    def __init__(self):
        self.vector_db = None
        self.qa_chain = None
        self.embeddings = None
        self.llm = None
        # One summary dict per processed PDF: filename, preview text, pages.
        self.curriculum_docs = []
        # filename -> {1-based page number -> stripped page text}
        self.pdf_pages = {}

    def load_llm(self):
        """Load the ``microsoft/DialoGPT-medium`` text-generation model.

        Returns:
            True when the pipeline was built and stored in ``self.llm``;
            False when loading failed (the error is printed).
        """
        try:
            model_name = "microsoft/DialoGPT-medium"
            # Half precision is only used when a GPU is available; CPU
            # inference falls back to float32, which every op supports.
            dtype = torch.float16 if torch.cuda.is_available() else torch.float32

            tokenizer = AutoTokenizer.from_pretrained(model_name)
            # trust_remote_code is deliberately not passed: this checkpoint
            # uses an architecture built into transformers, so there is no
            # reason to allow repository code to execute.
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map="auto",
            )

            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=256,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.15,
            )

            self.llm = HuggingFacePipeline(pipeline=pipe)
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def extract_text_from_pdf_with_pages(self, pdf_path: str) -> Dict[int, str]:
        """Extract the text of every non-empty page of a PDF.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A dict mapping 1-based page numbers to stripped page text.
            Pages without text are omitted. An empty dict is returned when
            the file cannot be read (the error is printed).
        """
        try:
            pages = {}
            # The context manager closes the document even when reading a
            # page raises, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages[page_num] = text
            return pages
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {str(e)}")
            return {}

    def process_curriculum(self, slides_dir: str):
        """Index every ``*.pdf`` file found directly inside ``slides_dir``.

        Fills ``self.pdf_pages`` and ``self.curriculum_docs``, creates the
        embedding model, and builds ``self.vector_db``.

        Args:
            slides_dir: Directory containing the slide PDFs.

        Returns:
            True on success; False when no PDF or no text was found, or when
            any step raised (the error is printed).
        """
        try:
            # Sorted so documents are indexed in the same order on every run.
            pdf_files = sorted(Path(slides_dir).glob("*.pdf"))

            if not pdf_files:
                print("No PDF files found in the Slides directory!")
                return False

            # (filename, full_text) pairs. The filename travels with its
            # text so that PDFs yielding no text cannot shift the mapping
            # between chunks and their source file.
            documents = []

            for pdf_file in pdf_files:
                print(f"Processing: {pdf_file.name}")

                pages = self.extract_text_from_pdf_with_pages(str(pdf_file))
                self.pdf_pages[pdf_file.name] = pages

                # The "Page N:" markers are what find_relevant_pages later
                # parses out of the retrieved chunks.
                full_text = "\n\n".join(
                    f"Page {page_num}: {text}" for page_num, text in pages.items()
                )
                if not full_text:
                    continue

                documents.append((pdf_file.name, full_text))
                preview = full_text[:500] + "..." if len(full_text) > 500 else full_text
                self.curriculum_docs.append({
                    'filename': pdf_file.name,
                    'content': preview,
                    'pages': pages,
                })

            if not documents:
                print("No text could be extracted from PDF files!")
                return False

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len,
            )

            texts = []
            metadatas = []
            for filename, full_text in documents:
                for chunk_id, chunk in enumerate(text_splitter.split_text(full_text)):
                    texts.append(chunk)
                    metadatas.append({
                        'filename': filename,
                        'chunk_id': chunk_id,
                        'source': 'curriculum',
                    })

            self.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

            # NOTE(review): the store is persisted to ./chroma_db; confirm
            # that re-running against an existing directory does not index
            # the same chunks twice.
            self.vector_db = Chroma.from_texts(
                texts=texts,
                embedding=self.embeddings,
                metadatas=metadatas,
                persist_directory="./chroma_db",
            )

            print(f"Processed {len(pdf_files)} curriculum documents!")
            return True

        except Exception as e:
            print(f"Error processing curriculum: {str(e)}")
            return False

    def create_qa_chain(self):
        """Create the retrieval QA chain with the instructor prompt.

        Returns:
            True when ``self.qa_chain`` was created; False when the vector
            store or the language model has not been set up yet.
        """
        if self.vector_db is None or self.llm is None:
            return False

        qa_template = (
            "You are an expert programming instructor for the Inclusive World Curriculum.\n"
            "Use the following context to answer the student's question. "
            "If the information is not in the context,\n"
            "provide a helpful response based on your knowledge of programming concepts.\n"
            "\n"
            "Context: {context}\n"
            "\n"
            "Question: {question}\n"
            "\n"
            "Answer:"
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_db.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs={
                "prompt": PromptTemplate(
                    template=qa_template,
                    input_variables=["context", "question"],
                )
            },
        )

        return True

    def find_relevant_pages(self, question: str, filename: str = None) -> List[Dict]:
        """Find the slide pages most relevant to a question.

        Args:
            question: The question to search the vector store with.
            filename: When given, only pages of this PDF are returned.

        Returns:
            Up to three dicts with the keys ``filename``, ``page_number``,
            ``content`` and ``relevance_score`` (higher is more relevant),
            ordered from most to least relevant. An empty list is returned
            when nothing is indexed or the search fails.
        """
        if self.vector_db is None:
            return []

        try:
            results = self.vector_db.similarity_search(question, k=5)

            relevant_pages = []
            seen_pages = set()

            # Hits are walked in the order the search returned them, so that
            # ranking carries over to the pages directly.
            for rank, result in enumerate(results):
                doc_filename = result.metadata.get('filename', '')

                if filename and doc_filename != filename:
                    continue

                known_pages = self.pdf_pages.get(doc_filename, {})

                # Page numbers are recovered from the "Page N:" markers that
                # process_curriculum wrote into the indexed text.
                for match in re.findall(r'Page (\d+):', result.page_content):
                    page_number = int(match)
                    page_key = (doc_filename, page_number)
                    if page_key in seen_pages:
                        continue
                    seen_pages.add(page_key)

                    page_content = known_pages.get(page_number, "")
                    if page_content:
                        relevant_pages.append({
                            'filename': doc_filename,
                            'page_number': page_number,
                            'content': page_content,
                            'relevance_score': len(results) - rank,
                        })

            return relevant_pages[:3]  # Top 3 most relevant pages

        except Exception as e:
            print(f"Error finding relevant pages: {str(e)}")
            return []
223
+
224
def initialize_system():
    """Create a CurriculumAssistant and run every setup step in order.

    The steps are: load the language model, index the PDFs in the
    ``Slides`` directory, and create the QA chain. Setup stops at the first
    step that reports failure.

    Returns:
        A ``(status, assistant, curriculum_docs)`` tuple. On success
        ``status`` is a success message and the other two items are the
        ready assistant and its document summaries. On failure ``status``
        names the failed step and the other two items are ``None``.
    """
    assistant = CurriculumAssistant()

    # Load LLM
    if not assistant.load_llm():
        return "❌ Failed to load language model", None, None

    # Process curriculum
    if not assistant.process_curriculum("Slides"):
        return "❌ Failed to process curriculum documents", None, None

    # Create QA chain
    if not assistant.create_qa_chain():
        return "❌ Failed to create QA chain", None, None

    return "✅ System initialized successfully!", assistant, assistant.curriculum_docs
241
+
242
def ask_question(question: str, assistant: "CurriculumAssistant"):
    """Answer a question and list the slide pages that support the answer.

    Args:
        question: The student's question.
        assistant: An initialized assistant; it must provide ``qa_chain``
            and ``find_relevant_pages``.

    Returns:
        A ``(full_response, answer, page_info)`` tuple of strings.
        ``full_response`` is Markdown combining the answer and the page
        list. When the request cannot be served, the first item carries the
        message and the other two are empty strings.
    """
    if not assistant or not assistant.qa_chain:
        return "Please initialize the system first.", "", ""

    # A blank question would only make the model answer an empty prompt.
    if not question or not question.strip():
        return "Please enter a question.", "", ""

    try:
        answer = assistant.qa_chain.run(question)
        relevant_pages = assistant.find_relevant_pages(question)

        if relevant_pages:
            parts = ["📄 **Relevant Pages Found:**\n\n"]
            for i, page in enumerate(relevant_pages, 1):
                content = page['content']
                # Only mark the snippet as shortened when text was cut off.
                snippet = content[:300] + "..." if len(content) > 300 else content
                parts.append(f"**{i}. {page['filename']} - Page {page['page_number']}**\n")
                parts.append(f"```\n{snippet}\n```\n\n")
            page_info = "".join(parts)
        else:
            page_info = "No specific pages found for this question."

        full_response = f"## Answer\n\n{answer}\n\n---\n\n{page_info}"

        return full_response, answer, page_info

    except Exception as e:
        error_msg = f"Error processing question: {str(e)}"
        return error_msg, "", ""
272
+
273
# Initialize the system once at import time so the UI can show the outcome.
status, assistant, curriculum_docs = initialize_system()

# Create Gradio interface
with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎓 Inclusive World Curriculum Assistant")
    gr.Markdown("An AI-powered assistant that answers questions about your curriculum and shows relevant slide pages.")

    with gr.Row():
        with gr.Column(scale=2):
            # Status display
            status_display = gr.Textbox(
                value=status,
                label="System Status",
                interactive=False,
            )

            # Question input
            question_input = gr.Textbox(
                label="Ask a question about your curriculum",
                placeholder="e.g., What are if statements? How do loops work?",
                lines=3,
            )

            # Submit button
            submit_btn = gr.Button("🔍 Get Answer", variant="primary")

            # Answer output
            answer_output = gr.Markdown(
                label="Answer with Relevant Pages",
                value="Ask a question to get started!",
            )

        with gr.Column(scale=1):
            # Curriculum overview
            gr.Markdown("### 📚 Curriculum Documents")
            if curriculum_docs:
                for doc in curriculum_docs:
                    with gr.Accordion(f"📄 {doc['filename']}", open=False):
                        gr.Markdown(f"**Preview:** {doc['content']}")
            else:
                gr.Markdown("No curriculum documents loaded.")

    def process_question(question):
        """Return the Markdown response for the single answer component."""
        # ask_question returns (full_response, answer, page_info), but the
        # listeners below declare one output, so only the combined Markdown
        # is passed on.
        full_response, _answer, _page_info = ask_question(question, assistant)
        return full_response

    # Handle question submission
    submit_btn.click(
        fn=process_question,
        inputs=[question_input],
        outputs=[answer_output],
    )

    # Handle Enter key in question input
    question_input.submit(
        fn=process_question,
        inputs=[question_input],
        outputs=[answer_output],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
app_config.toml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build]
2
+ python_version = "3.11"
3
+
4
+ [env]
5
+ HF_HUB_ENABLE_HF_TRANSFER = "1"
6
+ TRANSFORMERS_CACHE = "/tmp/transformers_cache"
7
+ HF_HOME = "/tmp/hf_home"
8
+
9
+ [system_packages]
10
+ # Add any system packages if needed
11
+
12
+ [models]
13
+ # Preload models for faster startup
14
+ "microsoft/DialoGPT-medium" = "dialo-medium"
15
+ "sentence-transformers/all-MiniLM-L6-v2" = "all-minilm-l6-v2"
16
+
17
+ [datasets]
18
+ # Add any datasets if needed
19
+
20
+ [hardware]
21
+ # Hardware requirements for Gradio
22
+ cpu = "2"
23
+ memory = "8GB"
24
+ disk = "10GB"
25
+
26
+ [gradio]
27
+ # Gradio specific settings
28
+ title = "Inclusive World Curriculum Assistant"
29
+ description = "AI-powered assistant that answers questions about curriculum and shows relevant slide pages"
30
+ theme = "soft"
31
+ share = false
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.44.0
2
+ langchain==0.3.26
3
+ langchain-community==0.3.27
4
+ chromadb==1.0.15
5
+ sentence-transformers==5.0.0
6
+ transformers==4.35.2
7
+ torch==2.0.1
8
+ PyMuPDF==1.23.8
9
+ accelerate==0.24.1
10
+ huggingface-hub==0.19.4
11
+ numpy==1.24.3
12
+ pandas==2.0.3
13
+ scikit-learn==1.3.0
14
+ tiktoken==0.5.1