aimanathar commited on
Commit
0e54eef
·
verified ·
1 Parent(s): ee62456

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import warnings
3
+
4
+ import faiss
5
+ import gradio as gr
6
+ import numpy as np
7
+ import pytesseract
8
+ import torch
9
+ from datasets import load_dataset # ✅ Hugging Face Dataset
10
+ from pdf2image import convert_from_path
11
+ from pdfminer.high_level import extract_text
12
+ from sentence_transformers import SentenceTransformer
13
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
14
+
15
+ warnings.filterwarnings("ignore")
16
+
17
+ # ================== PDF Handling ==================
18
def pdf_to_text(path):
    """Extract text from the PDF at *path*.

    First tries direct text extraction (pdfminer). If that yields fewer
    than 200 meaningful characters — typical of a scanned document — it
    falls back to rendering each page and running Tesseract OCR.
    Returns "" when both strategies fail.
    """
    try:
        text = extract_text(path) or ""
    except Exception:
        text = ""

    # A near-empty result suggests an image-only (scanned) PDF: OCR it.
    if len(text.strip()) < 200:
        try:
            pages = convert_from_path(path, dpi=200)
            text = "\n".join(pytesseract.image_to_string(page) for page in pages)
        except Exception:
            text = ""

    return text
31
+
32
+
33
def chunk_text(text, max_chars=800):
    """Split *text* into retrieval-sized chunks of at most *max_chars* chars.

    Paragraphs (newline-separated) are packed greedily into a buffer; a
    paragraph longer than *max_chars* is word-wrapped into pieces.
    Chunks of 80 characters or fewer are dropped as too short to be
    useful retrieval context.

    Bug fix vs. the original: the buffer is now flushed *before* an
    over-long paragraph is wrapped, so chunks come out in document order
    (previously the wrapped pieces jumped ahead of earlier buffered text).
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush pending short paragraphs first to preserve ordering.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            # Paragraph fits into the current buffer (plus a separator).
            buf = (buf + "\n" + p).strip()
        else:
            # Buffer is full: emit it and start a new one with this paragraph.
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    # Discard fragments too small to carry meaningful context.
    return [c for c in chunks if len(c) > 80]
50
+
51
+
52
# ================== Load Dataset from Hugging Face ==================
print("⬇️ Loading Hugging Face dataset...")
ds = load_dataset("aimanathar/virtualtranr")  # replace with your dataset name

# The train split is expected to expose a "text" column; keep only
# rows whose text is present and non-blank.
dataset_texts = [
    row["text"].strip()
    for row in ds["train"]
    if "text" in row and row["text"].strip()
]

print(f"✅ Loaded {len(dataset_texts)} rows from dataset.")
63
+
64
+
65
# ================== Make Chunks ==================
# Flatten every dataset row into retrieval-sized chunks.
all_chunks = [chunk for text in dataset_texts for chunk in chunk_text(text)]

print(f"✅ Created {len(all_chunks)} chunks from dataset.")
71
+
72
+
73
# ================== Embeddings + FAISS ==================
# Encode every chunk with a small sentence-transformer and build an
# exact (flat) FAISS index. normalize_embeddings=True yields unit-length
# vectors, so inner product (IndexFlatIP) is equivalent to cosine similarity.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
emb = embed_model.encode(all_chunks, normalize_embeddings=True).astype("float32")
dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)
79
+
80
+
81
def retrieve(query, k=5):
    """Return the *k* chunks most similar to *query*, joined by blank lines.

    The query is embedded with the same normalized encoder as the index,
    so the inner-product search ranks by cosine similarity.
    """
    query_vec = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _scores, hit_ids = index.search(query_vec, k)
    context = "\n\n".join(all_chunks[i] for i in hit_ids[0])
    return context
86
+
87
+
88
# ================== Model ==================
# Small instruction-tuned seq2seq model used to generate answers from
# the retrieved context.
model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Prefer GPU when available; fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(device)
94
+
95
+
96
# ================== Chat Function ==================
def chat_fn(message, history=None):
    """Answer *message* using the RAG pipeline.

    Retrieves the top-5 dataset chunks as context, builds a prompt, and
    generates a deterministic (beam-search, no sampling) answer with the
    seq2seq model. *history* is accepted for gr.ChatInterface but unused.
    """
    context = retrieve(message, k=5)  # context drawn from the dataset index
    prompt = f"Context:\n{context}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    encoded = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    generated = gen_model.generate(**encoded, max_new_tokens=120, num_beams=4, do_sample=False)
    answer = tok.decode(generated[0], skip_special_tokens=True)
    return answer.strip()
103
+
104
+
105
# ================== Gradio Interface ==================
# Chat UI wiring: gr.ChatInterface calls chat_fn with (message, history).
iface = gr.ChatInterface(
    fn=chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10). Powered by Hugging Face dataset + PDFs."
)

iface.launch()