aimanathar commited on
Commit
befffdd
·
verified ·
1 Parent(s): c29482e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import warnings
3
+
4
+ import faiss
5
+ import gradio as gr
6
+ import numpy as np
7
+ import pdfplumber
8
+ import pytesseract
9
+ import torch
10
+ from datasets import load_dataset
11
+ from pdf2image import convert_from_path
12
+ from pdfminer.high_level import extract_text
13
+ from sentence_transformers import SentenceTransformer
14
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
+
16
+ warnings.filterwarnings("ignore")
17
+
18
# ================== PDF Handling Functions ==================
def pdf_to_text(path):
    """Extract plain text from the PDF at *path*.

    Tries the embedded text layer first (pdfminer). When that yields fewer
    than 200 stripped characters — typically a scanned document — it falls
    back to rasterizing the pages (pdf2image) and running Tesseract OCR.
    Returns "" when both strategies fail.
    """
    try:
        text = extract_text(path) or ""
    except Exception:
        text = ""

    # Almost no text layer -> assume scanned pages and OCR them instead.
    if len(text.strip()) < 200:
        try:
            images = convert_from_path(path, dpi=200)
            text = "\n".join(pytesseract.image_to_string(img) for img in images)
        except Exception:
            text = ""
    return text
33
+
34
+
35
def chunk_text(text, max_chars=800):
    """Split *text* into retrieval chunks of at most *max_chars* characters.

    Non-empty lines are treated as paragraphs and greedily packed into a
    buffer; a paragraph longer than *max_chars* is word-wrapped into pieces.
    Chunks are emitted in document order, and fragments of <= 80 characters
    are dropped as too short to be useful retrieval context.
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush the pending buffer BEFORE emitting the wrapped pieces,
            # otherwise earlier buffered text would appear after this
            # paragraph's pieces and chunks would leave document order.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            # Paragraph still fits in the current buffer (+1 for "\n").
            buf = (buf + "\n" + p).strip()
        else:
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    return [c for c in chunks if len(c) > 80]
52
+
53
# ================== Load Dataset ==================
# Pull the practical-notes dataset and extract text from any PDF files it
# references. This runs at import time (flat Gradio app script).
print("📥 Loading dataset...")
ds = load_dataset("aimanathar/virtualtranr")

all_texts = []
for item in ds["train"]:
    # Reset per record: previously `txt` could be undefined (first record
    # without a "file" key) or silently reuse the previous record's text.
    txt = ""
    if "file" in item:  # dataset rows may carry a PDF file-path column
        try:
            # open() lives inside the try so a missing/unreadable file also
            # falls through to the pdfminer/OCR fallback instead of crashing.
            with open(item["file"], "rb") as f:
                with pdfplumber.open(f) as pdf:
                    txt = "\n".join(page.extract_text() or "" for page in pdf.pages)
        except Exception:
            txt = pdf_to_text(item["file"])
    if txt and len(txt.strip()) > 80:
        all_texts.append(txt)

if not all_texts:
    # Keep the app bootable even when no usable documents were extracted.
    all_texts = ["Sample fallback text about Physics & Chemistry Practicals"]

print(f"✅ Extracted {len(all_texts)} documents.")
73
+
74
# ================== Chunk + Embeddings ==================
chunks = []
for t in all_texts:
    chunks.extend(chunk_text(t, 800))

if not chunks:
    # chunk_text drops pieces of <= 80 chars, so short fallback/doc text can
    # leave nothing; index the raw documents rather than crash below on an
    # empty embedding matrix (emb.shape[1] on a 0-row array).
    chunks = list(all_texts)

print(f"✅ Total chunks: {len(chunks)}")

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Normalized embeddings + inner-product index == cosine-similarity search.
emb = embed_model.encode(chunks, normalize_embeddings=True).astype("float32")

dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)
87
+
88
def retrieve(query, k=5):
    """Return the *k* chunks most similar to *query*, joined by blank lines.

    Similarity is cosine (normalized embeddings over an inner-product FAISS
    index). When the index holds fewer than *k* vectors FAISS pads the
    result ids with -1; those are skipped explicitly — previously
    chunks[-1] silently returned the last chunk for every padded slot.
    """
    q = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _, ids = index.search(q, k)
    return "\n\n".join(chunks[i] for i in ids[0] if i >= 0)
93
+
94
# ================== Load Model ==================
# Seq2seq generator used to answer questions over the retrieved context.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
gen_model.to(device)
100
+
101
# ================== Chat Function ==================
def chat_fn(message, history=None):
    """Generate an exam-ready answer for *message* via RAG.

    Retrieves the most relevant chunks, builds a context+question prompt,
    and decodes a deterministic beam-search answer from the seq2seq model.
    *history* is required by gr.ChatInterface's signature but unused.
    """
    ctx = retrieve(message)
    prompt = f"Context:\n{ctx}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    enc = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=1024,
    ).to(device)
    generated = gen_model.generate(
        **enc, max_new_tokens=120, num_beams=4, do_sample=False
    )
    answer = tok.decode(generated[0], skip_special_tokens=True)
    return answer.strip()
108
+
109
# ================== Gradio Interface ==================
# Chat UI wiring; launch() starts the (blocking) Gradio server.
iface = gr.ChatInterface(
    chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10).",
)

iface.launch()