Sazid2 committed on
Commit
d7fa455
·
verified ·
1 Parent(s): 16bf99d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +394 -202
app.py CHANGED
@@ -1,15 +1,24 @@
1
  # app.py
2
  """
3
- Jajabor – SEBA Assamese Class 10 Tutor (Free-tier CPU version using pypdf2)
 
 
 
 
 
 
 
 
 
4
  """
5
 
6
  import os
7
  import io
8
  import sqlite3
9
- from datetime import datetime
10
  import traceback
 
11
 
12
- from pypdf2 import PdfReader # <-- FIXED
13
  import numpy as np
14
  from PIL import Image
15
  import gradio as gr
@@ -20,16 +29,15 @@ import sympy as sp
20
 
21
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
22
 
23
- # ----------------- CONFIG -----------------
24
- APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Free, pypdf2)"
25
 
26
  BASE_DIR = os.path.abspath(os.path.dirname(__file__))
27
  PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
28
  DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")
29
 
30
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
31
- USE_HF_INFERENCE = False
32
-
33
  LLM_LOCAL_NAME = "google/flan-t5-small"
34
  LLM_MAX_TOKENS = 128
35
 
@@ -37,19 +45,22 @@ CHUNK_SIZE = 600
37
  CHUNK_OVERLAP = 120
38
  TOP_K = 5
39
 
40
- # ----------------- DB -----------------
41
  def init_db(path=DB_PATH):
42
  os.makedirs(os.path.dirname(path), exist_ok=True)
43
  conn = sqlite3.connect(path)
44
  cur = conn.cursor()
45
- cur.execute("""
 
46
  CREATE TABLE IF NOT EXISTS users (
47
  id INTEGER PRIMARY KEY AUTOINCREMENT,
48
  username TEXT UNIQUE,
49
  created_at TEXT
50
  )
51
- """)
52
- cur.execute("""
 
 
53
  CREATE TABLE IF NOT EXISTS interactions (
54
  id INTEGER PRIMARY KEY AUTOINCREMENT,
55
  user_id INTEGER,
@@ -59,291 +70,472 @@ def init_db(path=DB_PATH):
59
  is_math INTEGER,
60
  FOREIGN KEY(user_id) REFERENCES users(id)
61
  )
62
- """)
 
63
  conn.commit()
64
  conn.close()
65
 
66
- init_db()
67
-
68
- def get_or_create_user(username):
 
69
  conn = sqlite3.connect(DB_PATH)
70
  cur = conn.cursor()
71
  cur.execute("SELECT id FROM users WHERE username=?", (username,))
72
  row = cur.fetchone()
73
  if row:
74
- uid = row[0]
75
  else:
76
  cur.execute(
77
  "INSERT INTO users (username, created_at) VALUES (?, ?)",
78
- (username, datetime.utcnow().isoformat())
79
  )
80
  conn.commit()
81
- uid = cur.lastrowid
82
  conn.close()
83
- return uid
84
 
85
- def log_interaction(uid, q, a, is_math):
86
  conn = sqlite3.connect(DB_PATH)
87
  cur = conn.cursor()
88
- cur.execute("""
 
89
  INSERT INTO interactions (user_id, timestamp, query, answer, is_math)
90
  VALUES (?, ?, ?, ?, ?)
91
- """, (uid, datetime.utcnow().isoformat(), q, a, 1 if is_math else 0))
 
 
92
  conn.commit()
93
  conn.close()
94
 
95
- def get_stats(uid):
96
  conn = sqlite3.connect(DB_PATH)
97
  cur = conn.cursor()
98
- cur.execute("SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?", (uid,))
 
 
99
  row = cur.fetchone()
100
  conn.close()
101
- return row[0] or 0, row[1] or 0
 
 
 
 
102
 
103
- # ----------------- PDF LOADING (pypdf2) -----------------
104
- def extract_text_from_pdf(pdf_path):
105
- text = []
106
  try:
107
  reader = PdfReader(pdf_path)
108
  for page in reader.pages:
109
  try:
110
- t = page.extract_text() or ""
111
- text.append(t)
112
- except:
113
  continue
114
  except Exception as e:
115
  print("PDF read error:", e)
116
- return "\n".join(text)
117
 
118
- def load_all_pdfs(pdf_dir):
119
  texts = []
120
  metas = []
121
  if not os.path.isdir(pdf_dir):
122
- print("PDF folder missing", pdf_dir)
123
  return texts, metas
124
-
125
- for fn in sorted(os.listdir(pdf_dir)):
126
- if fn.lower().endswith(".pdf"):
127
- p = os.path.join(pdf_dir, fn)
128
- print("Reading:", p)
129
- t = extract_text_from_pdf(p)
130
- texts.append(t)
131
- metas.append({"source": fn})
132
  return texts, metas
133
 
134
- def split_text(txt, size, overlap):
135
- if not txt:
136
  return []
137
- out = []
138
- step = size - overlap
139
- i = 0
140
- while i < len(txt):
141
- ch = txt[i:i+size]
142
- if ch.strip():
143
- out.append(ch)
144
- i += step
145
- return out
146
-
147
- # ----------------- Embeddings -----------------
148
- print("Loading embeddings...")
149
- embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
150
-
151
- texts, metas = load_all_pdfs(PDF_DIR)
152
-
153
- chunks = []
154
- chunk_meta = []
155
- for t, m in zip(texts, metas):
156
- c = split_text(t, CHUNK_SIZE, CHUNK_OVERLAP)
157
- chunks.extend(c)
158
- chunk_meta.extend([m] * len(c))
159
-
160
- print("Total chunks:", len(chunks))
161
-
 
 
 
162
  index = None
163
- if chunks:
164
- print("Encoding...")
165
  try:
166
- emb = embedder.encode(chunks, batch_size=32, show_progress_bar=False)
167
- emb = emb.astype("float32")
168
- index = faiss.IndexFlatL2(emb.shape[1])
169
- index.add(emb)
170
- print("FAISS Ready")
171
  except Exception as e:
172
- print("FAISS error:", e)
 
 
 
173
 
174
- def rag_search(q):
175
  if index is None:
176
  return []
177
  try:
178
- qv = embedder.encode([q]).astype("float32")
179
- D, I = index.search(qv, TOP_K)
180
- res = []
181
- for d, idx in zip(D[0], I[0]):
182
- if idx != -1:
183
- res.append({"score": float(d), "text": chunks[idx], "meta": chunk_meta[idx]})
184
- return res
185
- except:
 
 
 
 
 
 
 
 
186
  return []
187
 
188
- # ----------------- Local LLM -----------------
189
- print("Loading CPU LLM:", LLM_LOCAL_NAME)
 
190
  try:
191
- tok = AutoTokenizer.from_pretrained(LLM_LOCAL_NAME)
192
- mdl = AutoModelForSeq2SeqLM.from_pretrained(LLM_LOCAL_NAME)
193
- llm = pipeline("text2text-generation", model=mdl, tokenizer=tok)
194
- except:
195
- llm = None
 
 
196
 
197
  SYSTEM_PROMPT = """
198
- You are Jajabor, a SEBA Assamese tutor.
199
- Answer in Assamese. Use textbook context.
 
 
 
 
 
 
200
  """
201
 
202
- def build_prompt(ctx, q, hist):
203
- C = ""
204
- for i, b in enumerate(ctx, 1):
205
- C += f"\n[Context {i} – {b['meta'].get('source')}]\n{b['text']}\n"
206
- H = ""
207
- for r, m in hist:
208
- H += f"{r}: {m}\n"
209
- return f"""{SYSTEM_PROMPT}
210
-
211
- আগৰ কথোপকথন:
212
- {H}
213
 
214
- প্ৰশ্ন:
215
- {q}
 
216
 
217
- সম্পৰ্কিত পাঠ্য:
218
- {C}
219
 
220
- সহজ ভাষাত উত্তৰ দিয়া।
221
- """
222
 
223
- def answer_with_rag(q, hist):
224
- ctx = rag_search(q)
225
- prompt = build_prompt(ctx, q, hist)
226
 
227
- if llm is None:
228
- return "LLM not available."
229
 
230
- try:
231
- out = llm(prompt, max_new_tokens=LLM_MAX_TOKENS, do_sample=False)
232
- return out[0]["generated_text"]
233
- except Exception as e:
234
- return f"LLM error: {e}"
235
 
236
- # ----------------- OCR + Math -----------------
237
- def ocr_image(img):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  try:
239
  img = img.convert("RGB")
240
- return pytesseract.image_to_string(img, lang="asm+eng").strip()
241
- except:
242
- return ""
243
-
244
- def is_math_query(q):
245
- if any(c in q for c in "0123456789+-*/=^()%"):
 
 
 
 
 
 
 
 
 
 
246
  return True
247
- return any(k in q.lower() for k in ["গণিত", "math", "সমীকৰণ"])
 
248
 
249
- def math_solve(expr):
250
  try:
251
  expr = expr.replace("^", "**")
252
  if "=" in expr:
253
- L, R = expr.split("=", 1)
254
- eq = sp.Eq(sp.sympify(L), sp.sympify(R))
 
 
255
  sol = sp.solve(eq)
256
- return f"সমাধান: {sol}"
 
 
 
 
 
 
 
257
  else:
258
- s = sp.simplify(sp.sympify(expr))
259
- return f"উত্তৰ: {s}"
260
- except:
261
- return "সমীকৰণ বুজিবলৈ অসুবিধা।"
262
-
263
- # ----------------- CHAT -----------------
264
- def login(username, state):
265
- username = (username or "").strip()
266
- if not username:
267
- return state, "⚠️ লগিনৰ নাম দিয়ক।"
268
- uid = get_or_create_user(username)
269
- state = {"username": username, "user_id": uid}
270
- t, m = get_stats(uid)
271
- return state, f"ব্যৱহাৰকাৰী: {username}\nমোট প্ৰশ্ন: {t}\nগণিত: {m}"
272
-
273
- def chat_logic(username, text, img, aud, hist, state):
274
- if hist is None:
275
- hist = []
276
 
277
- if not state or "user_id" not in state:
278
- return hist + [[text or "", "⚠️ লগিন কৰক।"]], state, ""
279
 
280
- q_parts = []
281
- if text:
282
- q_parts.append(text)
283
 
284
- if img:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  try:
286
- im = Image.open(img)
287
- o = ocr_image(im)
288
- if o:
289
- q_parts.append(o)
290
- except:
291
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
- if not q_parts:
294
- return hist + [["", "⚠️ প্ৰশ্ন লিখক।"]], state, ""
 
 
295
 
296
- full_q = "\n".join(q_parts)
297
 
298
  conv = []
299
- for u, a in hist:
300
  if u:
301
  conv.append(("Student", u))
302
- if a:
303
- conv.append(("Tutor", a))
304
-
305
- if is_math_query(full_q):
306
- sol = math_solve(full_q)
307
- final_q = f"{full_q}\n\nMath result:\n{sol}\n\nExplain in simple Assamese."
 
 
 
 
 
 
 
 
308
  else:
309
- final_q = full_q
310
 
311
- ans = answer_with_rag(final_q, conv)
312
- log_interaction(state["user_id"], full_q, ans, is_math_query(full_q))
313
 
314
- return hist + [[full_q, ans]], state, ""
 
 
 
315
 
316
- # ----------------- UI -----------------
317
- with gr.Blocks(title=APP_NAME) as demo:
318
- gr.Markdown("# 🧭 Jajabor – SEBA Assamese Class 10 Tutor (Free, pypdf2)")
319
 
320
- state = gr.State({})
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  with gr.Row():
323
  with gr.Column(scale=1):
324
- user = gr.Textbox(label="নাম")
325
- login_btn = gr.Button("Login")
326
- stats = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  with gr.Column(scale=3):
328
- chat = gr.Chatbot(height=500)
329
- txt = gr.Textbox(label="প্ৰশ্ন", lines=2)
330
- img = gr.Image(type="filepath", label="📷 ছবি")
331
- aud = gr.Audio(type="filepath", label="🎙️")
332
- ask = gr.Button("সোধক")
333
-
334
- login_btn.click(login, inputs=[user, state], outputs=[state, stats])
335
-
336
- ask.click(
337
- chat_logic,
338
- inputs=[user, txt, img, aud, chat, state],
339
- outputs=[chat, state, None]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  )
341
 
342
- txt.submit(
343
- chat_logic,
344
- inputs=[user, txt, img, aud, chat, state],
345
- outputs=[chat, state, None]
346
  )
347
 
 
348
  if __name__ == "__main__":
 
349
  demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
1
  # app.py
2
  """
3
+ Jajabor – SEBA Assamese Class 10 Tutor (Free-tier CPU-ready)
4
+ - PDF reading: PyPDF2
5
+ - CPU LLM: google/flan-t5-small (transformers pipeline)
6
+ - Embeddings: sentence-transformers/all-MiniLM-L6-v2
7
+ - FAISS for retrieval
8
+ - OCR via pytesseract
9
+ - SymPy for math solving
10
+ - Gradio UI (gr.Image uses type="filepath")
11
+ Notes:
12
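+ - requirements.txt must include: PyPDF2 (capitalized), gradio==4.44.0 (let pip install the gradio-client release that this gradio version depends on; a gradio-client==0.4.3 pin conflicts with it), sentence-transformers, faiss-cpu, transformers, torch, pytesseract, pillow, sympy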
13
  """
14
 
15
  import os
16
  import io
17
  import sqlite3
 
18
  import traceback
19
+ from datetime import datetime
20
 
21
+ from PyPDF2 import PdfReader
22
  import numpy as np
23
  from PIL import Image
24
  import gradio as gr
 
29
 
30
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
31
 
32
+ # -------------------- CONFIG --------------------
33
+ APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Free CPU)"
34
 
35
  BASE_DIR = os.path.abspath(os.path.dirname(__file__))
36
  PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
37
  DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")
38
 
39
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
40
+ USE_HF_INFERENCE = False # Free plan: use local small model
 
41
  LLM_LOCAL_NAME = "google/flan-t5-small"
42
  LLM_MAX_TOKENS = 128
43
 
 
45
  CHUNK_OVERLAP = 120
46
  TOP_K = 5
47
 
48
+ # -------------------- DATABASE --------------------
49
  def init_db(path=DB_PATH):
50
  os.makedirs(os.path.dirname(path), exist_ok=True)
51
  conn = sqlite3.connect(path)
52
  cur = conn.cursor()
53
+ cur.execute(
54
+ """
55
  CREATE TABLE IF NOT EXISTS users (
56
  id INTEGER PRIMARY KEY AUTOINCREMENT,
57
  username TEXT UNIQUE,
58
  created_at TEXT
59
  )
60
+ """
61
+ )
62
+ cur.execute(
63
+ """
64
  CREATE TABLE IF NOT EXISTS interactions (
65
  id INTEGER PRIMARY KEY AUTOINCREMENT,
66
  user_id INTEGER,
 
70
  is_math INTEGER,
71
  FOREIGN KEY(user_id) REFERENCES users(id)
72
  )
73
+ """
74
+ )
75
  conn.commit()
76
  conn.close()
77
 
78
def get_or_create_user(username: str):
    """Return the ``users.id`` for *username*, inserting a new row if needed.

    The name is stripped before use. ``None`` or a blank name returns ``None``
    without opening the database.

    Args:
        username: Login name typed by the student.

    Returns:
        The integer row id of the existing or newly created user, or ``None``
        when no usable name was given.
    """
    username = (username or "").strip()
    if not username:
        return None

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id FROM users WHERE username=?", (username,))
        row = cur.fetchone()
        if row:
            return row[0]
        cur.execute(
            "INSERT INTO users (username, created_at) VALUES (?, ?)",
            (username, datetime.utcnow().isoformat()),
        )
        conn.commit()
        return cur.lastrowid
    finally:
        # Always release the connection, even if a statement above raised.
        conn.close()
97
 
98
def log_interaction(user_id, query, answer, is_math: bool):
    """Append one question/answer pair to the ``interactions`` table.

    Args:
        user_id: Id of the user row the interaction belongs to.
        query: Full question text that was answered.
        answer: Answer text shown to the student.
        is_math: Whether the question was classified as a math question;
            stored as 1 or 0.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            """
            INSERT INTO interactions (user_id, timestamp, query, answer, is_math)
            VALUES (?, ?, ?, ?, ?)
            """,
            (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0),
        )
        conn.commit()
    finally:
        # Close the handle even when the INSERT fails (e.g. locked database).
        conn.close()
110
 
111
def get_user_stats(user_id):
    """Return ``(total, math_count)`` for the interactions logged by a user.

    Args:
        user_id: Id of the user whose rows are counted.

    Returns:
        A tuple of the number of logged interactions and the sum of their
        ``is_math`` flags; both are 0 when the user has no rows.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        row = conn.execute(
            "SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?",
            (user_id,),
        ).fetchone()
    finally:
        # Release the connection even if the query raises.
        conn.close()
    # SUM() yields NULL when there are no rows, hence the `or 0` fallbacks.
    total = row[0] or 0
    math_count = row[1] or 0
    return total, math_count
122
+
123
+ init_db()
124
 
125
+ # -------------------- PDF reading (PyPDF2) --------------------
126
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF, joined with newlines.

    Extraction is best-effort: a page that fails is reported and skipped, and
    a file that cannot be opened at all yields an empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, or ``""`` if nothing could be read.
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_path)
        for page_no, page in enumerate(reader.pages, start=1):
            try:
                page_texts.append(page.extract_text() or "")
            except Exception as page_err:
                # Keep going, but say which page was lost instead of hiding it.
                print(f"PDF page {page_no} skipped in {pdf_path}:", page_err)
    except Exception as e:
        print("PDF read error:", e)
    return "\n".join(page_texts)
139
 
140
def load_all_pdfs(pdf_dir: str):
    """Read every ``*.pdf`` directly inside *pdf_dir*, in sorted name order.

    Returns:
        Two parallel lists: the extracted text of each PDF and a
        ``{"source": <file name>}`` dict for each. Both are empty when the
        directory does not exist.
    """
    if not os.path.isdir(pdf_dir):
        print("PDF_DIR not found:", pdf_dir)
        return [], []

    pdf_names = [
        name for name in sorted(os.listdir(pdf_dir))
        if name.lower().endswith(".pdf")
    ]
    documents = []
    sources = []
    for pdf_name in pdf_names:
        full_path = os.path.join(pdf_dir, pdf_name)
        print("Reading:", full_path)
        documents.append(extract_text_from_pdf(full_path))
        sources.append({"source": pdf_name})
    return documents, sources
154
 
155
def split_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split *text* into overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart (at
    least 1). Whitespace-only chunks are dropped. Splitting stops as soon as a
    chunk reaches the end of the text, so no chunk is merely a suffix of the
    one before it.

    Args:
        text: Text to split; empty or ``None`` yields ``[]``.
        chunk_size: Maximum characters per chunk; non-positive yields ``[]``.
        overlap: Characters shared between neighbouring chunks.

    Returns:
        The list of chunk strings in document order.
    """
    if not text or chunk_size <= 0:
        return []
    step = max(chunk_size - overlap, 1)
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        piece = text[start:start + chunk_size]
        if piece.strip():
            chunks.append(piece)
        if start + chunk_size >= total:
            # This window already covered the tail; further windows would only
            # repeat part of it and add duplicate entries to the index.
            break
    return chunks
169
+
170
+ # -------------------- Embeddings + FAISS --------------------
171
# Load the sentence-embedding model once at import time.
print("Loading embedding model:", EMBEDDING_MODEL_NAME)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Read every textbook PDF found under PDF_DIR.
print("Loading PDFs from", PDF_DIR)
all_texts, all_metas = load_all_pdfs(PDF_DIR)
print("Number of PDFs:", len(all_texts))

# Flatten the documents into chunks, keeping one metadata entry per chunk so
# corpus_chunks[i] and corpus_metas[i] always describe the same passage.
corpus_chunks = []
corpus_metas = []
for doc_text, doc_meta in zip(all_texts, all_metas):
    doc_chunks = split_text(doc_text, CHUNK_SIZE, CHUNK_OVERLAP)
    corpus_chunks += doc_chunks
    corpus_metas += [doc_meta] * len(doc_chunks)

print("Total chunks:", len(corpus_chunks))

# `index` stays None whenever there is nothing to search or building failed.
index = None
if not corpus_chunks:
    print("No corpus chunks found: upload PDFs to ./pdfs/class10")
else:
    print("Encoding chunks (this may take some seconds)...")
    try:
        embs = embedding_model.encode(corpus_chunks, batch_size=32, show_progress_bar=False)
        embs = embs.astype("float32")
        dim = embs.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(embs)
        print("FAISS index ready; dim:", dim)
    except Exception as e:
        print("Failed to encode/add to index:", e)
        # Discard a half-built index so searches fall back to "no results".
        index = None
200
 
201
def rag_search(query: str, k: int = TOP_K):
    """Return up to *k* corpus chunks nearest to *query* in the FAISS index.

    Returns:
        A list of ``{"score", "text", "meta"}`` dicts, where ``score`` is the
        distance reported by the index. The list is empty when no index was
        built or the search raises.
    """
    if index is None:
        return []
    try:
        query_vec = embedding_model.encode([query]).astype("float32")
        distances, ids = index.search(query_vec, k)
        # The code skips -1 ids, which appear to mark unfilled result slots.
        return [
            {
                "score": float(distance),
                "text": corpus_chunks[chunk_id],
                "meta": corpus_metas[chunk_id],
            }
            for distance, chunk_id in zip(distances[0], ids[0])
            if chunk_id != -1
        ]
    except Exception as e:
        print("RAG search error:", e)
        return []
222
 
223
+ # -------------------- Local CPU LLM (flan-t5-small) --------------------
224
# Load the local seq2seq model; `llm_pipe` stays None if anything fails so the
# answer function can report that instead of crashing.
print("Loading local CPU LLM:", LLM_LOCAL_NAME)
llm_pipe = None
try:
    tokenizer = AutoTokenizer.from_pretrained(LLM_LOCAL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(LLM_LOCAL_NAME)
    llm_pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map=None,
    )
    print("Local LLM loaded.")
except Exception as e:
    print("Failed to load local LLM (will return notice):", e)
    llm_pipe = None
234
 
235
  SYSTEM_PROMPT = """
236
+ You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
237
+ Always prefer to answer in Assamese. If the student clearly asks for English, you may reply in English.
238
+
239
+ Rules:
240
+ - Use ONLY the given textbook context.
241
+ - If you are not sure, say: "এই প্ৰশ্নটো পাঠ্যপুথিৰ অংশত স্পষ্টকৈ নাই, সেয়েহে মই নিশ্চিত নহয়।"
242
+ - বোঝাপৰা সহজ ভাষাত ব্যাখ্যা কৰা, উদাহৰণ দিয়ক।
243
+ - If it is a maths question, explain step-by-step clearly.
244
  """
245
 
246
def build_rag_prompt(context_blocks, question, chat_history):
    """Assemble the LLM prompt from system rules, history, question and context.

    Args:
        context_blocks: Retrieved passages, each a dict with ``"text"`` and a
            ``"meta"`` dict (its ``"source"`` defaults to ``"textbook"``).
        question: The student's question text.
        chat_history: Iterable of ``(role, message)`` pairs.

    Returns:
        The complete prompt string.
    """
    ctx = "".join(
        f"\n[Context {number} – {passage['meta'].get('source', 'textbook')}]\n{passage['text']}\n"
        for number, passage in enumerate(context_blocks, start=1)
    )
    hist = "".join(f"{role}: {msg}\n" for role, msg in chat_history)

    return f"""{SYSTEM_PROMPT}

পূৰ্বৰ বাৰ্তাসমূহ:
{hist}

সদস্যৰ প্ৰশ্ন:
{question}

সম্পৰ্কিত পাঠ্যপুথিৰ অংশ:
{ctx}

এতিয়া একেদম সহায়ক আৰু বুজিবলৈ সহজ উত্তৰ দিয়া।
"""
 
 
270
 
271
def llm_answer_with_rag(question: str, chat_history):
    """Answer *question* with the local LLM, grounded on retrieved passages.

    The cheap availability checks run first, so no retrieval or prompt
    building is done when the result would be thrown away.

    Args:
        question: Question text (may already include a math result).
        chat_history: Iterable of ``(role, message)`` pairs for the prompt.

    Returns:
        The generated answer, or a notice string when hosted inference is
        selected, the local model is not loaded, or generation fails.
    """
    if USE_HF_INFERENCE:
        return "HF inference disabled in free plan."
    if llm_pipe is None:
        return "Local LLM not loaded. Ensure model weights are available on first run."

    retrieved = rag_search(question, TOP_K)
    prompt = build_rag_prompt(retrieved, question, chat_history)
    try:
        out = llm_pipe(prompt, max_new_tokens=LLM_MAX_TOKENS, do_sample=False)
        # Normalise the output shapes this function has always accepted.
        first = out[0] if isinstance(out, list) and len(out) > 0 else None
        # Require a dict here: `in` on a str would be a substring test.
        if isinstance(first, dict) and "generated_text" in first:
            return first["generated_text"]
        if isinstance(first, str):
            return first
        if isinstance(out, dict) and "generated_text" in out:
            return out["generated_text"]
        return str(out)
    except Exception as e:
        traceback.print_exc()
        return f"LLM generation failed: {e}"
291
+
292
+ # -------------------- OCR + Math helpers --------------------
293
def ocr_from_image(img: Image.Image):
    """Run Tesseract OCR on *img* and return the stripped text.

    Tries Assamese+English first, then Tesseract's default language; returns
    ``""`` for ``None`` input or when both attempts fail.
    """
    if img is None:
        return ""
    try:
        img = img.convert("RGB")
    except Exception:
        # Conversion is optional; OCR is still attempted on the image as given.
        pass

    text = ""
    for tess_options in ({"lang": "asm+eng"}, {}):
        try:
            text = pytesseract.image_to_string(img, **tess_options)
        except Exception:
            continue
        break
    return text.strip()
308
+
309
def is_likely_math(text: str) -> bool:
    """Heuristically decide whether *text* looks like a math question.

    A text counts as math when it contains any ASCII digit or one of
    ``+-*/=^()%``, or any of the known keywords. Keyword matching is
    case-insensitive, so "Solve" and "EQUATION" are recognised too.

    NOTE(review): a single digit or hyphen is enough to match, and the keyword
    list contains generic words (the Assamese for "example" and "question"),
    so this over-triggers on ordinary questions; confirm this is intended.

    Args:
        text: Combined question text; empty or ``None`` returns ``False``.
    """
    if not text:
        return False
    math_chars = "0123456789+-*/=^()%"
    if any(ch in math_chars for ch in text):
        return True
    keywords = ("গণিত", "সমীকৰণ", "উদাহৰণ", "প্ৰশ্ন", "বীজগণিত", "solve", "equation")
    lowered = text.casefold()
    return any(keyword in lowered for keyword in keywords)
317
 
318
def solve_math_expression(expr: str):
    """Solve or simplify *expr* with SymPy and return an Assamese explanation.

    ``^`` is read as exponentiation. Text containing ``=`` is split at the
    first ``=`` and solved as an equation; anything else is simplified. Any
    failure returns a fixed message asking for a clearer expression.

    NOTE(review): ``sympify`` parses with ``eval``-like semantics and *expr*
    comes straight from user input; consider restricting the accepted
    characters before parsing.
    """
    try:
        expr = expr.replace("^", "**")
        if "=" not in expr:
            simplified = sp.simplify(sp.sympify(expr))
            return (
                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
                f"{expr}\n\nসরলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simplified}"
            )

        lhs_text, rhs_text = expr.split("=", 1)
        lhs = sp.sympify(lhs_text)
        rhs = sp.sympify(rhs_text)
        eq = sp.Eq(lhs, rhs)
        sol = sp.solve(eq)
        bullet_lines = "\n".join(
            f"- {line}"
            for line in (
                "প্ৰথমে সমীকৰণ লওঁ:",
                f"{sp.pretty(eq)}",
                "Sympy ৰ সহায়ত সমাধান পোৱা যায়:",
                str(sol),
            )
        )
        return (
            "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n"
            + bullet_lines
            + f"\n\nসেয়ে সমাধান: {sol}"
        )
    except Exception:
        return (
            "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
            "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্ট কৰি লিখক: উদাহৰণ – 2*x + 3 = 7"
        )
 
 
 
 
 
 
348
 
349
def speech_to_text(audio):
    """Placeholder for speech recognition: ignores *audio*, returns ``""``."""
    return ""
351
 
352
def text_to_speech(text: str):
    """Placeholder for speech synthesis: ignores *text*, returns ``""``.

    An empty string is returned rather than ``None`` so that callers always
    hand a string to the Gradio output.
    """
    return ""
355
 
356
+ # -------------------- Chat logic --------------------
357
def login_user(username, user_state):
    """Log a student in by name and build the stats summary.

    Returns:
        ``(user_state, markdown)``. For a blank name the state is returned
        untouched with a warning; otherwise the state is a new dict holding
        ``username`` and ``user_id`` and the markdown shows the user's totals.
    """
    cleaned = (username or "").strip()
    if not cleaned:
        return user_state, "⚠️ অনুগ্ৰহ কৰি প্ৰথমে লগিনৰ বাবে এটা নাম লিখক।"

    user_id = get_or_create_user(cleaned)
    new_state = {"username": cleaned, "user_id": user_id}
    total, math_count = get_user_stats(user_id)
    summary = "".join(
        [
            f"👤 ব্যৱহাৰকাৰী: **{cleaned}**\n\n",
            f"📊 মোট প্ৰশ্ন: **{total}**\n",
            f"🧮 গণিত প্ৰশ্ন: **{math_count}**",
        ]
    )
    return new_state, summary
370
+
371
def _open_image(image_input):
    """Turn an image payload (path, file-like or PIL image) into a PIL image, or None."""
    if image_input is None or (isinstance(image_input, str) and not image_input):
        return None
    try:
        if isinstance(image_input, str):
            return Image.open(image_input)
        if isinstance(image_input, Image.Image):
            return image_input
        reader = getattr(image_input, "read", None)
        if callable(reader):
            return Image.open(io.BytesIO(reader()))
    except Exception:
        return None
    return None


def _ocr_image_input(image_input):
    """OCR an image payload and return its text, or "" when that is not possible."""
    img = _open_image(image_input)
    if img is None:
        return ""
    try:
        return ocr_from_image(img) or ""
    except Exception:
        return ""
    finally:
        # Close only images opened here; a caller-supplied PIL image is left alone.
        if img is not image_input:
            img.close()


def chat_logic(
    username,
    text_input,
    image_input,
    audio_input,
    chat_history,
    user_state,
):
    """Handle one chat turn: gather the question, answer it, log it.

    The question is built from the voice transcript, the OCR text of the
    uploaded image and the typed text, in that order. Questions classified as
    math are first passed to the SymPy helper and its result is added to what
    the LLM is asked to explain.

    Args:
        username: Name from the login box (not used by this function).
        text_input: Typed question text.
        image_input: Image payload: a file path, file-like object or PIL image.
        audio_input: Audio payload handed to ``speech_to_text``.
        chat_history: List of ``[user, bot]`` pairs, or ``None``.
        user_state: Session dict; must contain a truthy ``"user_id"``.

    Returns:
        ``(chat_history, user_state, audio_out)`` where ``chat_history`` is a
        new list with the turn appended and ``audio_out`` is the
        text-to-speech result (``""`` if none).
    """
    if chat_history is None:
        chat_history = []

    if not user_state or not user_state.get("user_id"):
        sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
        return chat_history + [[text_input or "", sys_msg]], user_state, ""

    user_id = user_state["user_id"]
    final_query_parts = []

    voice_text = speech_to_text(audio_input)
    if voice_text:
        final_query_parts.append(voice_text)

    ocr_text = _ocr_image_input(image_input)
    if ocr_text:
        final_query_parts.append(ocr_text)

    if text_input:
        final_query_parts.append(text_input)

    if not final_query_parts:
        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক, কিম্বা ছবি আপলোড কৰক।"
        return chat_history + [["", sys_msg]], user_state, ""

    full_query = "\n".join(final_query_parts)

    # Flatten the [user, bot] pairs into (role, message) tuples for the prompt.
    conv = []
    for user_msg, bot_msg in chat_history:
        if user_msg:
            conv.append(("Student", user_msg))
        if bot_msg:
            conv.append(("Tutor", bot_msg))

    is_math = is_likely_math(full_query)
    if is_math:
        math_answer = solve_math_expression(full_query)
        combined_question = (
            full_query
            + "\n\nগণিত প্ৰোগ্ৰামে এই ফলাফল দিছে:\n"
            + math_answer
            + "\n\nঅনুগ্ৰহ কৰি শ্রেণী ১০ ৰ শিক্ষাৰ্থীৰ বাবে সহজ ভাষাত ব্যাখ্যা কৰক।"
        )
        final_answer = llm_answer_with_rag(combined_question, conv)
    else:
        final_answer = llm_answer_with_rag(full_query, conv)

    if final_answer is None:
        final_answer = "মাফ কৰক মই ইয়াৰ উত্তর দিব পৰা নাই।"

    # The log stores the full combined query; the chat shows only the first
    # non-empty of typed text, voice transcript and OCR text.
    log_interaction(user_id, full_query, final_answer, is_math)
    audio_out = text_to_speech(final_answer) or ""
    display_question = text_input or voice_text or ocr_text or "(empty)"
    chat_history = chat_history + [[display_question, final_answer]]

    return chat_history, user_state, audio_out
 
 
458
 
459
+ # -------------------- Gradio UI --------------------
460
with gr.Blocks(title=APP_NAME, css=None) as demo:
    gr.Markdown(
        """
        # 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor (Free CPU)

        - Upload your SEBA Class 10 PDFs to `pdfs/class10` in this repo (or when running locally, ensure folder exists)
        - Text + Image (OCR) input
        - Math step-by-step solutions
        - User login + progress
        """
    )

    # Per-session login info, filled in by login_user.
    user_state = gr.State({})

    with gr.Row():
        # Left column: login box, stats and usage tips.
        with gr.Column(scale=1):
            gr.Markdown("### 👤 লগিন")
            username_inp = gr.Textbox(
                label="নাম / ইউজাৰ আইডি",
                placeholder="উদাহৰণ: abu10, student01 ...",
            )
            login_btn = gr.Button("✅ Login / লগিন")
            stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")

            gr.Markdown(
                """
                ### 💡 টিপছ
                - "ক্লাছ ১০ গণিত: উদাহৰণ ৩.১ প্ৰশ্ন ২" – এই ধৰণৰ প্ৰশ্ন ভাল
                - ফটো আপলোড কৰিলে টেক্স্টটো OCR কৰি পঢ়িব চেষ্টা কৰা হয়
                - সম্ভৱ হলে প্ৰশ্নটো অসমীয়াত সোধক 🙂
                """
            )

        # Right column: conversation plus the question inputs.
        with gr.Column(scale=3):
            chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)
            text_inp = gr.Textbox(
                label="আপোনাৰ প্ৰশ্ন লিখক",
                placeholder='উদাহৰণ: "ক্লাছ ১০ অসমীয়া: অনুচ্ছেদ পাঠ ১ ৰ মূল বিষয় কি?"',
                lines=2,
            )

            with gr.Row():
                image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="filepath")
                audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub — not used now)", type="numpy")

            with gr.Row():
                ask_btn = gr.Button("🤖 জাজাবৰক সোধক")
                audio_out = gr.Audio(
                    label="🔊 উত্তৰৰ অডিঅ’ (TTS – future upgrade)",
                    interactive=False,
                    type="filepath",
                )

    login_btn.click(
        login_user,
        inputs=[username_inp, user_state],
        outputs=[user_state, stats_md],
    )

    def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
        """Adapt the UI input order to chat_logic and remember the typed name."""
        session = {} if user_state_inner is None else user_state_inner
        if username_inner and not session.get("username"):
            session["username"] = username_inner
        return chat_logic(username_inner, text, image, audio, history, session)

    # The button and Enter-in-textbox trigger the same handler with the same wiring.
    chat_inputs = [text_inp, image_inp, audio_inp, chat, user_state, username_inp]
    chat_outputs = [chat, user_state, audio_out]
    ask_btn.click(wrapped_chat, inputs=chat_inputs, outputs=chat_outputs)
    text_inp.submit(wrapped_chat, inputs=chat_inputs, outputs=chat_outputs)
537
 
538
+ # -------------------- Launch --------------------
539
  if __name__ == "__main__":
540
+ # bind to 0.0.0.0 and allow share link for hosted environments where localhost may be blocked
541
  demo.launch(server_name="0.0.0.0", server_port=7860, share=True)