Sazid2 commited on
Commit
5ca1bf0
·
verified ·
1 Parent(s): f44e216

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +446 -0
app.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import io
4
+ import sqlite3
5
+ from datetime import datetime
6
+ import fitz # PyMuPDF
7
+ import numpy as np
8
+ from PIL import Image
9
+ import gradio as gr
10
+ import faiss
11
+ import pytesseract
12
+ from sentence_transformers import SentenceTransformer
13
+ import sympy as sp
14
+
15
+ # Optional: huggingface inference
16
+ from huggingface_hub import InferenceApi
17
+
18
# ------------- CONFIG -------------
APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Spaces)"
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")   # textbook PDFs live here
DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")  # SQLite file for users/history

# Embedding model - compact for Spaces. Swap if you run on stronger infra.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# LLM model to call via Inference API (optional)
# WARNING: not all large models will run under a free plan; see docs.
LLM_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # can change to a hosted model
USE_HF_INFERENCE = True  # set False if you plan to load a local small model

# RAG chunking parameters, measured in characters (not tokens).
CHUNK_SIZE = 600
CHUNK_OVERLAP = 120
TOP_K = 5

# NOTE(review): `inference` is bound only when a token is present. The
# token-less path is safe only because call_llm_via_hf() checks the token
# before touching `inference`; keep that guard in place.
HUGGINGFACE_API_TOKEN = os.environ.get("HF_API_TOKEN", None)
if USE_HF_INFERENCE:
    if not HUGGINGFACE_API_TOKEN:
        print("Warning: HF API token not found in env (HF_API_TOKEN). LLM calls will fail.")
    else:
        inference = InferenceApi(repo_id=LLM_MODEL_NAME, token=HUGGINGFACE_API_TOKEN)
42
+
43
# ------------- DB helpers -------------
def init_db(db_path=None):
    """Create the `users` and `interactions` tables if they do not exist.

    Args:
        db_path: SQLite file path. Defaults to the module-level DB_PATH
            (resolved at call time, not at definition time).
    """
    if db_path is None:
        db_path = DB_PATH
    # BUG FIX: os.path.dirname() is "" for a bare filename, and
    # os.makedirs("", exist_ok=True) raises FileNotFoundError.
    parent = os.path.dirname(db_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE,
                created_at TEXT
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS interactions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                user_id INTEGER,
                timestamp TEXT,
                query TEXT,
                answer TEXT,
                is_math INTEGER,
                FOREIGN KEY(user_id) REFERENCES users(id)
            )
            """
        )
        conn.commit()
    finally:
        # always release the file handle, even if a CREATE fails
        conn.close()
72
+
73
def get_or_create_user(username: str):
    """Return the users.id for `username`, inserting a row on first login.

    Returns:
        The integer user id, or None when the name is blank/whitespace.
    """
    username = username.strip()
    if not username:
        return None
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id FROM users WHERE username=?", (username,))
        row = cur.fetchone()
        if row:
            user_id = row[0]
        else:
            # NOTE(review): datetime.utcnow() is deprecated in Python 3.12;
            # kept so timestamps stay format-consistent with log_interaction.
            cur.execute(
                "INSERT INTO users (username, created_at) VALUES (?, ?)",
                (username, datetime.utcnow().isoformat()),
            )
            conn.commit()
            user_id = cur.lastrowid
    finally:
        # BUG FIX: the original leaked the connection if SELECT/INSERT raised.
        conn.close()
    return user_id
92
+
93
def log_interaction(user_id, query, answer, is_math: bool):
    """Append one question/answer record to the `interactions` table.

    Args:
        user_id: users.id of the logged-in student.
        query: the full merged question text.
        answer: the tutor's answer text.
        is_math: stored as 1/0 for later per-user stats.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute(
            """
            INSERT INTO interactions (user_id, timestamp, query, answer, is_math)
            VALUES (?, ?, ?, ?, ?)
            """,
            (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0),
        )
        conn.commit()
    finally:
        # BUG FIX: close even when the INSERT raises (original leaked the handle)
        conn.close()
105
+
106
def get_user_stats(user_id):
    """Return (total_questions, math_questions) for one user.

    SUM(is_math) is NULL when the user has no rows; both values are
    coalesced to 0.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?", (user_id,)
        )
        row = cur.fetchone()
    finally:
        # BUG FIX: close even on query failure (original leaked the handle)
        conn.close()
    total = row[0] or 0
    math_count = row[1] or 0
    return total, math_count
117
+
118
+ init_db()
119
+
120
# ------------- PDF loading + RAG -------------
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the plain text of every non-empty page in `pdf_path`, newline-joined."""
    # BUG FIX: the original never closed the PyMuPDF document, leaking the
    # file handle for each PDF read. Document supports the context-manager
    # protocol, which closes it even on error.
    with fitz.open(pdf_path) as doc:
        pages = [page.get_text("text") for page in doc]
    return "\n".join(p for p in pages if p)
129
+
130
def load_all_pdfs(pdf_dir: str):
    """Read every *.pdf file in `pdf_dir`.

    Returns:
        (texts, metas): parallel lists — full text per PDF and a
        {"source": filename} dict per PDF. Both empty when the directory
        is missing or holds no PDFs.
    """
    texts, metas = [], []
    if not os.path.isdir(pdf_dir):
        print("PDF_DIR not found:", pdf_dir)
        return texts, metas
    for fname in os.listdir(pdf_dir):
        # skip anything that is not a PDF (case-insensitive extension check)
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(pdf_dir, fname)
        print("Reading:", path)
        texts.append(extract_text_from_pdf(path))
        metas.append({"source": fname})
    return texts, metas
144
+
145
def split_text(text: str, chunk_size=600, overlap=120):
    """Split `text` into overlapping character chunks.

    Consecutive chunks share `overlap` characters of context; chunks that
    are only whitespace are dropped. Returns [] for empty text.
    """
    # BUG FIX: the original advanced with `start = max(end - overlap, end)`,
    # which is always `end` — the overlap was silently never applied. Use a
    # fixed positive step so the overlap is honoured and the loop still
    # always makes forward progress even when overlap >= chunk_size.
    step = max(chunk_size - overlap, 1)
    chunks = []
    start = 0
    while start < len(text):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
        start += step
    return chunks
155
+
156
# --- Module-level RAG corpus build (runs once, at import/startup) ---
print("Loading embedding model:", EMBEDDING_MODEL_NAME)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

print("Loading PDFs from", PDF_DIR)
all_texts, all_metas = load_all_pdfs(PDF_DIR)
print("Number of PDFs:", len(all_texts))

# Chunk every PDF; corpus_metas stays index-aligned with corpus_chunks so
# rag_search can map a faiss hit back to its source filename.
corpus_chunks = []
corpus_metas = []
for text, meta in zip(all_texts, all_metas):
    chs = split_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
    corpus_chunks.extend(chs)
    corpus_metas.extend([meta] * len(chs))

print("Total chunks:", len(corpus_chunks))
if len(corpus_chunks) > 0:
    print("Encoding chunks...")
    # cast defensively to the float32 that faiss requires
    embs = embedding_model.encode(corpus_chunks, batch_size=32, show_progress_bar=False).astype("float32")
    dim = embs.shape[1]
    # exact (brute-force) L2 index — fine for a few thousand chunks
    index = faiss.IndexFlatL2(dim)
    index.add(embs)
    print("FAISS index ready; dim:", dim)
else:
    # `index is None` is the sentinel rag_search checks for "no corpus loaded"
    index = None
    print("No corpus chunks - upload PDFs to the `pdfs/class10` folder in the repo.")
181
+
182
def rag_search(query: str, k: int = TOP_K):
    """Return the k nearest textbook chunks for `query`.

    Each hit is a dict with "score" (L2 distance, lower is closer),
    "text" (chunk contents) and "meta" (source-file info). Returns []
    when no corpus was loaded.
    """
    if index is None:
        return []
    query_vec = embedding_model.encode([query]).astype("float32")
    distances, positions = index.search(query_vec, k)
    return [
        {
            "score": float(dist),
            "text": corpus_chunks[pos],
            "meta": corpus_metas[pos],
        }
        for dist, pos in zip(distances[0], positions[0])
        if pos != -1  # faiss pads with -1 when fewer than k vectors exist
    ]
199
+
200
# ------------- LLM helpers -------------
# Tutor persona prepended to every LLM request. The body is a *runtime*
# string (partly Assamese, partly English instructions to the model) and
# must not be reworded or translated.
SYSTEM_PROMPT = """
You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
Always prefer to answer in Assamese. If the student clearly asks for English, you may reply in English.

Rules:
- Use ONLY the given textbook context.
- If you are not sure, say: "এই প্ৰশ্নটো পাঠ্যপুথিৰ অংশত স্পষ্টকৈ নাই, সেয়েহে মই নিশ্চিত নহয়।"
- বোঝাপৰা সহজ ভাষাত ব্যাখ্যা কৰা, উদাহৰণ দিয়ক।
- If it is a maths question, explain step-by-step clearly.
"""
211
+
212
def build_rag_prompt(context_blocks, question, chat_history):
    """Assemble the complete LLM prompt.

    Combines the system rules, the running conversation, the student's
    question, and the retrieved textbook chunks into one text block.
    """
    # numbered context sections, each tagged with its source filename
    ctx = "".join(
        f"\n[Context {i} – {block['meta'].get('source', 'textbook')}]\n{block['text']}\n"
        for i, block in enumerate(context_blocks, start=1)
    )
    # flatten (role, message) pairs into "role: message" lines
    hist = "".join(f"{role}: {msg}\n" for role, msg in chat_history)

    return f"""{SYSTEM_PROMPT}

পূৰ্বৰ বাৰ্তাসমূহ:
{hist}

সদস্যৰ প্ৰশ্ন:
{question}

সম্পৰ্কিত পাঠ্যপুথিৰ অংশ:
{ctx}

এতিয়া একেদম সহায়ক আৰু বুজিবলৈ সহজ উত্তৰ দিয়া।
"""
236
+
237
def call_llm_via_hf(prompt: str, max_tokens=512):
    """Send `prompt` to the HF Inference API and return the generated text.

    Never raises: missing tokens and API failures are returned as plain
    error strings so the chat UI can display them.
    """
    if not HUGGINGFACE_API_TOKEN:
        return "LLM not available: HF API token (env HF_API_TOKEN) is required to call the Inference API."
    try:
        raw = inference(inputs=prompt, params={"max_new_tokens": max_tokens, "temperature": 0.3})
        # The Inference API response shape is model-specific: a dict, a list
        # of dicts, or a bare string. Normalise each case to plain text.
        if isinstance(raw, dict) and "generated_text" in raw:
            return raw["generated_text"]
        if isinstance(raw, list) and len(raw) > 0 and "generated_text" in raw[0]:
            return raw[0]["generated_text"]
        if isinstance(raw, str):
            return raw
        return str(raw)
    except Exception as e:
        # deliberate best-effort: surface the failure as chat text, don't crash the UI
        return f"LLM call failed: {e}"
253
+
254
def llm_answer_with_rag(question: str, chat_history):
    """Retrieve textbook context for `question`, then ask the LLM for an answer."""
    retrieved_context = rag_search(question, TOP_K)
    prompt = build_rag_prompt(retrieved_context, question, chat_history)
    if not USE_HF_INFERENCE:
        return "LLM not configured (USE_HF_INFERENCE=False)."
    return call_llm_via_hf(prompt)
261
+
262
# ------------- OCR + math helpers -------------
def ocr_from_image(img: Image.Image):
    """OCR an uploaded question image; returns stripped text ("" for no image).

    Tries Assamese+English first, then falls back to tesseract's default
    language pack.
    """
    if img is None:
        return ""
    rgb = img.convert("RGB")
    try:
        extracted = pytesseract.image_to_string(rgb, lang="asm+eng")
    except Exception:
        # the `asm` traineddata may be missing on the host; retry with defaults
        extracted = pytesseract.image_to_string(rgb)
    return extracted.strip()
272
+
273
def is_likely_math(text: str) -> bool:
    """Heuristic maths detector.

    True when the text contains any digit/operator character, or any of a
    few Assamese maths keywords.
    """
    if any(ch in "0123456789+-*/=^()%" for ch in text):
        return True
    keywords = ("গণিত", "সমীকৰণ", "উদাহৰণ", "প্ৰশ্ন", "বীজগণিত")
    return any(kw in text for kw in keywords)
279
+
280
def solve_math_expression(expr: str):
    """Solve an equation or simplify an expression with sympy.

    Returns an Assamese step-by-step explanation string; never raises —
    any parse/solve failure yields a polite "please rewrite" message.
    """
    try:
        # students write powers as ^; sympy expects **
        expr = expr.replace("^", "**")
        if "=" in expr:
            # treat "lhs = rhs" as an equation and solve it symbolically
            left, right = expr.split("=", 1)
            left_s = sp.sympify(left)
            right_s = sp.sympify(right)
            eq = sp.Eq(left_s, right_s)
            sol = sp.solve(eq)
            steps = []
            steps.append("প্ৰথমে সমীকৰণ লওঁ:")
            steps.append(f"{sp.pretty(eq)}")
            steps.append("Sympy ৰ সহায়ত সমাধান পোৱা যায়:")
            steps.append(str(sol))
            explanation = "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n" + "\n".join(f"- {s}" for s in steps)
            explanation += f"\n\nসেয়েহে সমাধান: {sol}"
        else:
            # no '=' sign: just simplify the expression
            expr_s = sp.sympify(expr)
            simp = sp.simplify(expr_s)
            explanation = (
                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
                f"{expr}\n\nসরলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
            )
        return explanation
    except Exception:
        # NOTE(review): sympify on free-text/OCR input fails routinely, so the
        # broad catch is intentional — the fallback asks for a clearer equation.
        return (
            "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
            "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2x + 3 = 7"
        )
309
+
310
def speech_to_text(audio):
    """Stub: voice input is not implemented yet; always returns an empty string."""
    return ""
312
+
313
def text_to_speech(text: str):
    """Stub: TTS is not implemented yet; always returns None (no audio output)."""
    return None
315
+
316
# ------------- Chat logic -------------
def login_user(username, user_state):
    """Gradio callback for the login button.

    Resolves (or creates) the user record and returns the updated session
    state plus a Markdown stats summary. A blank name leaves the state
    untouched and returns a warning message instead.
    """
    name = (username or "").strip()
    if not name:
        return user_state, "⚠️ অনুগ্ৰহ কৰি প্ৰথমে লগিনৰ বাবে এটা নাম লিখক।"
    uid = get_or_create_user(name)
    total, math_count = get_user_stats(uid)
    stats = (
        f"👤 ব্যৱহাৰকাৰী: **{name}**\n\n"
        f"📊 মোট প্ৰশ্ন: **{total}**\n"
        f"🧮 গণিত প্ৰশ্ন: **{math_count}**"
    )
    return {"username": name, "user_id": uid}, stats
330
+
331
def chat_logic(
    username,
    text_input,
    image_input,
    audio_input,
    chat_history,
    user_state,
):
    """Main chat callback: merge voice/OCR/text input into one query, answer
    via RAG (with a sympy pre-solve for maths), log it, and extend the chat.

    Returns (chat_history, user_state, audio_output_or_None).
    """
    # refuse to answer until login has populated user_state["user_id"]
    if not user_state or not user_state.get("user_id"):
        sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
        chat_history = chat_history + [[text_input or "", sys_msg]]
        return chat_history, user_state, None

    user_id = user_state["user_id"]

    # collect the question from every modality: voice (stub, currently ""),
    # then OCR from the uploaded image, then the typed text
    final_query_parts = []
    voice_text = speech_to_text(audio_input)
    if voice_text:
        final_query_parts.append(voice_text)

    ocr_text = ""
    if image_input is not None:
        try:
            # assumes a file-like upload object with .read() — TODO confirm
            # against what gr.Image(type="file") yields in the pinned version
            img = Image.open(io.BytesIO(image_input.read()))
        except Exception:
            # fall back to treating the input as an already-decoded image
            img = image_input
        ocr_text = ocr_from_image(img)
        if ocr_text:
            final_query_parts.append(ocr_text)

    if text_input:
        final_query_parts.append(text_input)

    if not final_query_parts:
        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক, কিম্বা ছবি আপলোড কৰক।"
        chat_history = chat_history + [["", sys_msg]]
        return chat_history, user_state, None

    full_query = "\n".join(final_query_parts)
    # reshape the [[user, bot], ...] history into (role, message) tuples
    # for build_rag_prompt
    conv = []
    for u, b in chat_history:
        if u:
            conv.append(("Student", u))
        if b:
            conv.append(("Tutor", b))

    is_math = is_likely_math(full_query)
    if is_math:
        # pre-solve with sympy, then ask the LLM to explain the result simply
        math_answer = solve_math_expression(full_query)
        combined_question = (
            full_query
            + "\n\nগণিত প্ৰোগ্ৰামে এই ফলাফল দিছে:\n"
            + math_answer
            + "\n\nঅনুগ্ৰহ কৰি শ্রেণী ১০ ৰ শিক্ষাৰ্থীৰ বাবে সহজ ভাষাত ব্যাখ্যা কৰক।"
        )
        final_answer = llm_answer_with_rag(combined_question, conv)
    else:
        final_answer = llm_answer_with_rag(full_query, conv)

    log_interaction(user_id, full_query, final_answer, is_math)
    audio_out = text_to_speech(final_answer)  # currently always None (TTS stub)
    display_question = text_input or voice_text or ocr_text or "(empty)"
    chat_history = chat_history + [[display_question, final_answer]]
    return chat_history, user_state, audio_out
395
+
396
# ------------- Gradio UI -------------
with gr.Blocks(title=APP_NAME) as demo:
    gr.Markdown(
        """
# 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor (Spaces)

- Upload your SEBA Class 10 PDFs to `pdfs/class10` in this Space repo
- Text + Image (OCR) input
- Math step-by-step solutions
- User login + progress
"""
    )

    # per-session dict; becomes {"username": ..., "user_id": ...} after login
    user_state = gr.State({})

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 👤 লগিন")
            username_inp = gr.Textbox(label="নাম / ইউজাৰ আইডি", placeholder="উদাহৰণ: abu10")
            login_btn = gr.Button("✅ Login / লগিন")
            stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")
        with gr.Column(scale=3):
            chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)
            text_inp = gr.Textbox(label="আপোনাৰ প্ৰশ্ন লিখক", lines=2)
            with gr.Row():
                # NOTE(review): type="file" for gr.Image was removed in Gradio 4
                # (valid values: "numpy"/"pil"/"filepath") — confirm the pinned
                # gradio version still accepts it.
                image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="file")
                audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub)", type="numpy")
            with gr.Row():
                ask_btn = gr.Button("🤖 জাজাবৰক সোধক")
                audio_out = gr.Audio(label="🔊 উত্তৰৰ অডিঅ’ (TTS – future)", interactive=False)

    login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])

    def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
        # backfill the typed username into the state when login was skipped
        if user_state_inner and username_inner and not user_state_inner.get("username"):
            user_state_inner["username"] = username_inner
        return chat_logic(username_inner, text, image, audio, history, user_state_inner)

    # both the Ask button and pressing Enter in the textbox trigger the same handler
    ask_btn.click(
        wrapped_chat,
        inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
        outputs=[chat, user_state, audio_out],
    )
    text_inp.submit(
        wrapped_chat,
        inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
        outputs=[chat, user_state, audio_out],
    )

if __name__ == "__main__":
    demo.launch()