Files changed (1) hide show
  1. app.py +121 -165
app.py CHANGED
@@ -7,13 +7,32 @@ import threading
7
  import pathlib
8
  import os
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # --------------------------------------------------
11
  # FILE TEXT EXTRACTION
12
  # --------------------------------------------------
13
 
14
- SUPPORTED_EXT = (".pdf",".docx",".txt",".png",".jpg",".jpeg",".webp",".bmp",".tiff")
 
 
15
 
16
  def extract_text_from_file(filepath):
 
17
  if not filepath:
18
  return ""
19
 
@@ -23,6 +42,7 @@ def extract_text_from_file(filepath):
23
  ext = pathlib.Path(filepath).suffix.lower()
24
 
25
  try:
 
26
  if ext == ".pdf":
27
  doc = fitz.open(filepath)
28
  text = []
@@ -39,9 +59,11 @@ def extract_text_from_file(filepath):
39
  return f.read()
40
 
41
  elif ext in (".png",".jpg",".jpeg",".webp",".bmp",".tiff"):
 
42
  try:
43
  img = Image.open(filepath)
44
  return pytesseract.image_to_string(img)
 
45
  except Exception as e:
46
  return "OCR failed: " + str(e)
47
 
@@ -51,42 +73,34 @@ def extract_text_from_file(filepath):
51
  except Exception as e:
52
  return "Could not read file: " + str(e)
53
 
 
54
  # --------------------------------------------------
55
- # MODELS (verified HuggingFace IDs)
56
  # --------------------------------------------------
57
 
58
  MODELS = {
59
- # < 1GB: Quick slide summaries or vocab lists
60
  "Gemma 3 270M [0.6GB | Lightning-fast Edge]": "google/gemma-3-270m-it",
61
  "Qwen 3 0.6B GGUF [0.5GB | Classroom Assistant]": "Qwen/Qwen3-0.6B-GGUF",
62
  "TinyLlama 1.1B [0.5GB]": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
63
 
64
- # 1-3GB: Smart mobile & basic assistants
65
  "Qwen 3.5 2B [2.4GB | The Student Tutor]": "Qwen/Qwen3.5-2B",
66
  "Phi-4 Mini [1.8GB | Logical Powerhouse]": "microsoft/Phi-4-mini-instruct",
67
  "Gemma 3 1B [2.1GB | Stable & Coherent]": "google/gemma-3-1b-it",
68
 
69
- # 3-8GB: The "Daily Driver" sweet spot
70
  "Qwen 3.5 9B [7.8GB | BEST FOR LESSON PLANS]": "Qwen/Qwen3.5-9B",
71
  "Llama 3.1 8B [5.2GB | Industry Standard]": "meta-llama/Meta-Llama-3.1-8B-Instruct",
72
  "Mistral Small 3 [7.1GB | Concise & Accurate]": "mistralai/Mistral-Small-3-Instruct",
73
  "Gemma 3 9B [6.3GB | Creative & Safe]": "google/gemma-3-9b-it",
74
 
75
- # 8-12GB: Enhanced reasoning for complex curricula
76
- "Qwen 3.5 35B-A3B [11.5GB | Elite Pedagogy MoE]": "Qwen/Qwen3.5-35B-A3B",
77
  "Mistral Small 12B [9.5GB | Perfect VRAM Balance]": "mistralai/Mistral-Nemo-Instruct-2407",
78
 
79
- # 12-20GB: Professional grade logic
80
  "Qwen 3.5 27B [18GB | Dense Curriculum Architect]": "Qwen/Qwen3.5-27B",
81
- "DeepSeek V3 Lite 21B [16.0GB | Academic Beast]": "deepseek-ai/DeepSeek-V3-Lite",
82
-
83
- # > 20GB: The Frontier models
84
- "Qwen 3.5 397B-A17B [75GB+ | Full Textbook Author]": "Qwen/Qwen3.5-397B-A17B",
85
- "GPT-OSS 120B [72GB+ | SOTA Logic & Coding]": "openai/gpt-oss-120b"
86
  }
87
 
88
  ALL_MODEL_NAMES = list(MODELS.keys())
89
 
 
90
  # --------------------------------------------------
91
  # PIPELINE CACHE
92
  # --------------------------------------------------
@@ -94,25 +108,33 @@ ALL_MODEL_NAMES = list(MODELS.keys())
94
  _pipeline_cache = {}
95
  _pipeline_lock = threading.Lock()
96
 
 
97
  def get_pipeline(model_id, hf_token):
 
98
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
99
 
100
  with _pipeline_lock:
 
101
  if model_id not in _pipeline_cache:
 
102
  try:
103
- token = hf_token.strip() if hf_token else None
 
 
 
 
 
 
104
 
105
  tokenizer = AutoTokenizer.from_pretrained(
106
  model_id,
107
- token=token,
108
- trust_remote_code=True
109
  )
110
 
111
  model = AutoModelForCausalLM.from_pretrained(
112
  model_id,
113
- token=token,
114
- trust_remote_code=True,
115
- device_map="cpu"
116
  )
117
 
118
  pipe = pipeline(
@@ -128,21 +150,28 @@ def get_pipeline(model_id, hf_token):
128
 
129
  return _pipeline_cache[model_id], None
130
 
 
131
  # --------------------------------------------------
132
  # INFERENCE
133
  # --------------------------------------------------
134
 
135
  SYSTEM_MSG = "You are an expert educational assistant. Use markdown."
136
 
137
- def ask_llm(model_label, prompt, hf_token=os.getenv("hgface_tok")):
 
 
 
138
  model_id = MODELS[model_label]
139
- pipe,err = get_pipeline(model_id, hf_token)
 
140
 
141
  if err:
142
  return "Model load error:\n" + err
143
 
144
  try:
 
145
  combined = SYSTEM_MSG + "\n\n" + prompt
 
146
  out = pipe(
147
  combined,
148
  max_new_tokens=2048,
@@ -153,7 +182,6 @@ def ask_llm(model_label, prompt, hf_token=os.getenv("hgface_tok")):
153
 
154
  text = out[0]["generated_text"]
155
 
156
- # Remove prompt from output if echoed
157
  if text.startswith(combined):
158
  text = text[len(combined):]
159
 
@@ -162,19 +190,32 @@ def ask_llm(model_label, prompt, hf_token=os.getenv("hgface_tok")):
162
  except Exception as e:
163
  return "Inference error:\n" + str(e)
164
 
 
165
  # --------------------------------------------------
166
  # PROMPTS
167
  # --------------------------------------------------
168
 
169
  def make_prompts(topic):
 
170
  return {
171
- "lesson": "Create a lesson plan with headings and bullet points.\n\nTopic:\n"+topic,
172
- "qa": "Generate 10 exam questions with answers.\n\nTopic:\n"+topic,
173
- "mcq": "Generate 10 MCQs with 4 options and answers.\n\nTopic:\n"+topic,
174
- "summary": "Summarize the topic in 250-300 words.\n\nTopic:\n"+topic,
175
- "infographic": "Create a cheat sheet using tables and bullet points.\n\nTopic:\n"+topic
 
 
 
 
 
 
 
 
 
 
176
  }
177
 
 
178
  def generate_content(text, file, model_label, token):
179
  file_text = extract_text_from_file(file) if file else ""
180
 
@@ -185,175 +226,90 @@ def generate_content(text, file, model_label, token):
185
  return
186
 
187
  prompts = make_prompts(syllabus)
 
188
  WAIT = "Generating..."
189
- results = [WAIT, WAIT, WAIT, WAIT, WAIT]
 
190
 
191
  yield tuple(results)
192
 
193
  order = ["lesson","qa","mcq","summary","infographic"]
194
 
195
- for i,key in enumerate(order):
 
196
  res = ask_llm(model_label, prompts[key], token)
 
197
  results[i] = res
 
198
  yield tuple(results)
199
 
 
200
  # --------------------------------------------------
201
  # UI
202
  # --------------------------------------------------
203
 
204
  CSS = """
205
- @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
206
- body, .gradio-container {
207
- font-family: 'Inter', sans-serif !important;
208
- }
209
- .app-header {
210
- background: linear-gradient(135deg,
211
- #1a1a2e 0%,
212
- #16213e 50%,
213
- #0f3460 100%);
214
- border-radius: 16px;
215
- padding: 28px 32px;
216
- margin-bottom: 8px;
217
- border: 1px solid rgba(99,102,241,0.3);
218
- }
219
- .app-header h1 {
220
- font-size: 2rem !important;
221
- font-weight: 700 !important;
222
- background: linear-gradient(90deg,
223
- #818cf8,
224
- #c084fc,
225
- #38bdf8);
226
- -webkit-background-clip: text;
227
- -webkit-text-fill-color: transparent;
228
- margin-bottom: 6px !important;
229
- }
230
- .app-header p {
231
- color:
232
- #94a3b8 !important;
233
- font-size: 0.95rem;
234
- }
235
- .generate-btn {
236
- background: linear-gradient(135deg,
237
- #6366f1,
238
- #8b5cf6) !important;
239
- border: none !important;
240
- border-radius: 12px !important;
241
- font-weight: 600 !important;
242
- font-size: 1rem !important;
243
- transition: all 0.2s ease !important;
244
- width: 100% !important;
245
- }
246
- .generate-btn:hover {
247
- transform: translateY(-2px) !important;
248
- box-shadow: 0 8px 25px rgba(99,102,241,0.4) !important;
249
  }
250
  """
251
 
 
252
  with gr.Blocks() as demo:
253
- # ── Header ──
254
- gr.HTML("""
255
- <div class="app-header">
256
- <h1>🎓 AI Study Material Generator</h1>
257
- <p>Generate lesson notes, Q&amp;A, MCQs, a Mindmap, and a Cheat Sheet from any topic
258
- or syllabus using SOTA open-source LLMs running entirely on your CPU via transformers.pipeline. No API key needed.</p>
259
- </div>
260
- """)
261
-
262
- # ── Input Row ──
263
- with gr.Row(equal_height=False):
264
-
265
- # Left: syllabus input (paste OR upload)
266
- with gr.Column(scale=4):
267
- with gr.Tabs():
268
- with gr.TabItem("✏️ Paste Text"):
269
- text_input = gr.Textbox(
270
- show_label=False,
271
- placeholder=(
272
- "Paste your syllabus, topic, or any content here…\n"
273
- "e.g. The Water Cycle, Neural Networks, World War II, Photosynthesis"
274
- ),
275
- lines=7,
276
- )
277
- with gr.TabItem("📂 Upload File"):
278
- gr.Markdown(
279
- "Upload a **PDF**, **Word (.docx)**, **plain text (.txt)**, "
280
- "or **image** (PNG / JPG / WEBP) — text is extracted automatically."
281
- )
282
- file_input = gr.File(
283
- label="Upload syllabus file",
284
- file_types=[".pdf", ".docx", ".doc", ".txt",
285
- ".png", ".jpg", ".jpeg", ".webp", ".bmp"],
286
- file_count="single",
287
- )
288
- file_preview = gr.Textbox(
289
- label="Extracted text preview",
290
- lines=4,
291
- interactive=False,
292
- placeholder="Text extracted from the file will appear here…",
293
- )
294
- # Live preview when file is uploaded
295
- file_input.change(
296
- fn=lambda f: extract_text_from_file(f) if f else "",
297
- inputs=file_input,
298
- outputs=file_preview,
299
- )
300
-
301
- # Right: model selector + generate button
302
- with gr.Column(scale=2):
303
  model_selector = gr.Dropdown(
304
  choices=ALL_MODEL_NAMES,
305
  value=ALL_MODEL_NAMES[0],
306
- label="🤖 Model (all run locally via pipeline)",
307
- info=(
308
- "Tier 1 = fastest / least RAM. "
309
- "Tier 3 = best quality / needs 6–8 GB RAM. "
310
- "Models download on first use."
311
- ),
312
  )
 
313
  token_box = gr.Textbox(
314
- label="🔑 HF Token (optional)",
315
- info="Required for gated models. Your token stays private.",
316
- type="password",
317
- placeholder="hf_...",
318
- )
319
- btn = gr.Button(
320
- "⚡ Generate Study Materials",
321
- variant="primary",
322
- size="lg",
323
- elem_classes=["generate-btn"],
324
  )
325
 
326
- gr.HTML("<hr style='margin:8px 0; border-color:rgba(99,102,241,0.2)'>")
327
 
328
- # ── Output Tabs ──
329
  with gr.Tabs():
330
- with gr.TabItem("📖 Lesson Plan"):
331
- lesson = gr.Markdown(value="*Results will appear here after generation.*")
332
- with gr.TabItem("❓ Q & A"):
333
- qa = gr.Markdown(value="*Results will appear here after generation.*")
334
- with gr.TabItem("✅ MCQs"):
335
- mcq = gr.Markdown(value="*Results will appear here after generation.*")
336
- with gr.TabItem("📝 Summary"):
337
- summary = gr.Markdown(value="*Results will appear here after generation.*")
338
- with gr.TabItem("📊 Cheat Sheet"):
339
- cheat = gr.Markdown(value="*Results will appear here after generation.*")
340
-
341
- # ── Footer ──
342
- gr.HTML("""
343
- <div style='text-align:center; color:
344
- #64748b; font-size:0.8rem; margin-top:12px;'>
345
- Built with 🤗 Gradio · Hugging Face Transformers — 100% open-source · runs offline on CPU
346
- </div>
347
- """)
348
-
349
- # ── Wire up button ──
350
  btn.click(
351
  fn=generate_content,
352
- inputs=[text_input, file_input, model_selector, token_box],
353
- outputs=[lesson, qa, mcq, summary, cheat],
354
  )
355
 
 
356
  demo.launch(
357
- theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
358
- css=CSS,
 
 
 
359
  )
 
7
  import pathlib
8
  import os
9
 
10
+
11
+ # --------------------------------------------------
12
+ # TOKEN RESOLUTION
13
+ # --------------------------------------------------
14
+
15
def resolve_token(ui_token):
    """Resolve the Hugging Face token to use for model downloads.

    Preference order:
      1. the token typed into the UI (stripped),
      2. the ``hgface_tok`` environment variable (stripped),
      3. empty string (anonymous access).

    Args:
        ui_token: Raw value from the UI textbox; may be ``None`` when the
            field was never touched (Gradio can pass ``None``), empty, or
            padded with whitespace.

    Returns:
        A stripped token string, or ``""`` when no token is available.
    """
    # Guard against None before calling .strip(); strip once and reuse.
    if ui_token:
        cleaned = ui_token.strip()
        if cleaned:
            return cleaned

    env_token = os.getenv("hgface_tok")
    if env_token:
        return env_token.strip()

    return ""
24
+
25
+
26
  # --------------------------------------------------
27
  # FILE TEXT EXTRACTION
28
  # --------------------------------------------------
29
 
30
# Extensions the text extractor knows how to handle: documents first,
# then the OCR-able image formats.
SUPPORTED_EXT = (
    ".pdf", ".docx", ".txt",
    ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff",
)
33
 
34
  def extract_text_from_file(filepath):
35
+
36
  if not filepath:
37
  return ""
38
 
 
42
  ext = pathlib.Path(filepath).suffix.lower()
43
 
44
  try:
45
+
46
  if ext == ".pdf":
47
  doc = fitz.open(filepath)
48
  text = []
 
59
  return f.read()
60
 
61
  elif ext in (".png",".jpg",".jpeg",".webp",".bmp",".tiff"):
62
+
63
  try:
64
  img = Image.open(filepath)
65
  return pytesseract.image_to_string(img)
66
+
67
  except Exception as e:
68
  return "OCR failed: " + str(e)
69
 
 
73
  except Exception as e:
74
  return "Could not read file: " + str(e)
75
 
76
+
77
  # --------------------------------------------------
78
+ # MODELS
79
  # --------------------------------------------------
80
 
81
# Registry of selectable models: display label -> Hugging Face repo id.
# Labels carry the approximate download size so users can pick by RAM budget.
MODELS = {
    # Sub-1GB tier.
    "Gemma 3 270M [0.6GB | Lightning-fast Edge]": "google/gemma-3-270m-it",
    "Qwen 3 0.6B GGUF [0.5GB | Classroom Assistant]": "Qwen/Qwen3-0.6B-GGUF",
    "TinyLlama 1.1B [0.5GB]": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",

    # 1-3GB tier.
    "Qwen 3.5 2B [2.4GB | The Student Tutor]": "Qwen/Qwen3.5-2B",
    "Phi-4 Mini [1.8GB | Logical Powerhouse]": "microsoft/Phi-4-mini-instruct",
    "Gemma 3 1B [2.1GB | Stable & Coherent]": "google/gemma-3-1b-it",

    # 5-8GB tier.
    "Qwen 3.5 9B [7.8GB | BEST FOR LESSON PLANS]": "Qwen/Qwen3.5-9B",
    "Llama 3.1 8B [5.2GB | Industry Standard]": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "Mistral Small 3 [7.1GB | Concise & Accurate]": "mistralai/Mistral-Small-3-Instruct",
    "Gemma 3 9B [6.3GB | Creative & Safe]": "google/gemma-3-9b-it",

    # Large tier.
    "Mistral Small 12B [9.5GB | Perfect VRAM Balance]": "mistralai/Mistral-Nemo-Instruct-2407",
    "Qwen 3.5 27B [18GB | Dense Curriculum Architect]": "Qwen/Qwen3.5-27B",
}

# Dropdown choices are derived straight from the registry keys,
# preserving insertion order.
ALL_MODEL_NAMES = list(MODELS.keys())
102
 
103
+
104
  # --------------------------------------------------
105
  # PIPELINE CACHE
106
  # --------------------------------------------------
 
108
  _pipeline_cache = {}
109
  _pipeline_lock = threading.Lock()
110
 
111
+
112
  def get_pipeline(model_id, hf_token):
113
+
114
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
115
 
116
  with _pipeline_lock:
117
+
118
  if model_id not in _pipeline_cache:
119
+
120
  try:
121
+
122
+ kwargs = {
123
+ "trust_remote_code": True
124
+ }
125
+
126
+ if hf_token:
127
+ kwargs["token"] = hf_token
128
 
129
  tokenizer = AutoTokenizer.from_pretrained(
130
  model_id,
131
+ **kwargs
 
132
  )
133
 
134
  model = AutoModelForCausalLM.from_pretrained(
135
  model_id,
136
+ device_map="cpu",
137
+ **kwargs
 
138
  )
139
 
140
  pipe = pipeline(
 
150
 
151
  return _pipeline_cache[model_id], None
152
 
153
+
154
  # --------------------------------------------------
155
  # INFERENCE
156
  # --------------------------------------------------
157
 
158
  SYSTEM_MSG = "You are an expert educational assistant. Use markdown."
159
 
160
+ def ask_llm(model_label, prompt, hf_token=""):
161
+
162
+ token = resolve_token(hf_token)
163
+
164
  model_id = MODELS[model_label]
165
+
166
+ pipe, err = get_pipeline(model_id, token)
167
 
168
  if err:
169
  return "Model load error:\n" + err
170
 
171
  try:
172
+
173
  combined = SYSTEM_MSG + "\n\n" + prompt
174
+
175
  out = pipe(
176
  combined,
177
  max_new_tokens=2048,
 
182
 
183
  text = out[0]["generated_text"]
184
 
 
185
  if text.startswith(combined):
186
  text = text[len(combined):]
187
 
 
190
  except Exception as e:
191
  return "Inference error:\n" + str(e)
192
 
193
+
194
  # --------------------------------------------------
195
  # PROMPTS
196
  # --------------------------------------------------
197
 
198
def make_prompts(topic):
    """Build the five study-material prompts for *topic*.

    Returns a dict keyed by output type ("lesson", "qa", "mcq",
    "summary", "infographic"); every prompt ends with the topic text
    appended after a "Topic:" marker.
    """
    # All prompts share the same trailing topic section.
    suffix = "\n\nTopic:\n" + topic

    instructions = {
        "lesson": "Create a lesson plan with headings and bullet points.",
        "qa": "Generate 10 exam questions with answers.",
        "mcq": "Generate 10 MCQs with 4 options and answers.",
        "summary": "Summarize the topic in 250-300 words.",
        "infographic": "Create a cheat sheet using tables and bullet points.",
    }

    return {key: text + suffix for key, text in instructions.items()}
217
 
218
+
219
  def generate_content(text, file, model_label, token):
220
  file_text = extract_text_from_file(file) if file else ""
221
 
 
226
  return
227
 
228
  prompts = make_prompts(syllabus)
229
+
230
  WAIT = "Generating..."
231
+
232
+ results = [WAIT,WAIT,WAIT,WAIT,WAIT]
233
 
234
  yield tuple(results)
235
 
236
  order = ["lesson","qa","mcq","summary","infographic"]
237
 
238
+ for i, key in enumerate(order):
239
+
240
  res = ask_llm(model_label, prompts[key], token)
241
+
242
  results[i] = res
243
+
244
  yield tuple(results)
245
 
246
+
247
  # --------------------------------------------------
248
  # UI
249
  # --------------------------------------------------
250
 
251
  CSS = """
252
+ body,.gradio-container{
253
+ font-family:Inter,sans-serif!important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  }
255
  """
256
 
257
+
258
  with gr.Blocks() as demo:
259
+ gr.Markdown("# 🎓 AI Study Material Generator")
260
+
261
+ with gr.Row():
262
+ with gr.Column():
263
+ text_input = gr.Textbox(
264
+ placeholder="Paste syllabus or topic",
265
+ lines=6
266
+ )
267
+
268
+ file_input = gr.File(
269
+ label="Upload syllabus file"
270
+ )
271
+
272
+ with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  model_selector = gr.Dropdown(
274
  choices=ALL_MODEL_NAMES,
275
  value=ALL_MODEL_NAMES[0],
276
+ label="Model"
 
 
 
 
 
277
  )
278
+
279
  token_box = gr.Textbox(
280
+ label="HF Token (optional)",
281
+ type="password"
 
 
 
 
 
 
 
 
282
  )
283
 
284
+ btn = gr.Button("Generate")
285
 
 
286
  with gr.Tabs():
287
+ with gr.TabItem("Lesson"):
288
+ lesson = gr.Markdown()
289
+
290
+ with gr.TabItem("Q&A"):
291
+ qa = gr.Markdown()
292
+
293
+ with gr.TabItem("MCQ"):
294
+ mcq = gr.Markdown()
295
+
296
+ with gr.TabItem("Summary"):
297
+ summary = gr.Markdown()
298
+
299
+ with gr.TabItem("Cheat Sheet"):
300
+ cheat = gr.Markdown()
301
+
 
 
 
 
 
302
  btn.click(
303
  fn=generate_content,
304
+ inputs=[text_input,file_input,model_selector,token_box],
305
+ outputs=[lesson,qa,mcq,summary,cheat]
306
  )
307
 
308
+
309
# Launch the app with the indigo/purple Soft theme and the custom CSS above.
_theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")
demo.launch(theme=_theme, css=CSS)