aimanathar committed on
Commit
c29482e
·
verified ·
1 Parent(s): b007a3e

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -112
app.py DELETED
@@ -1,112 +0,0 @@
1
- import textwrap
2
- import warnings
3
-
4
- import faiss
5
- import gradio as gr
6
- import numpy as np
7
- import pytesseract
8
- import torch
9
- from datasets import load_dataset # ✅ Hugging Face Dataset
10
- from pdf2image import convert_from_path
11
- from pdfminer.high_level import extract_text
12
- from sentence_transformers import SentenceTransformer
13
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
14
-
15
# Hide noisy third-party warnings from the Space logs.
warnings.filterwarnings(action="ignore")
16
-
17
# ================== PDF Handling ==================
def pdf_to_text(path):
    """Extract text from a PDF, falling back to OCR for scanned documents.

    Tries pdfminer's embedded text layer first. If that yields fewer than
    200 characters of stripped text, the file is likely image-only, so each
    page is rendered at 200 dpi and run through Tesseract OCR instead.
    Returns "" when both strategies fail.
    """
    try:
        text = extract_text(path) or ""
    except Exception:
        text = ""
    if len(text.strip()) >= 200:
        return text
    # Text layer missing or tiny: rasterize pages and OCR them.
    try:
        pages = convert_from_path(path, dpi=200)
        return "\n".join(pytesseract.image_to_string(img) for img in pages)
    except Exception:
        return ""
31
-
32
-
33
def chunk_text(text, max_chars=800):
    """Split *text* into retrieval-sized chunks of at most ~max_chars chars.

    Newline-separated paragraphs are greedily packed into a buffer until
    adding the next one would exceed *max_chars*. Oversized paragraphs are
    wrapped with textwrap (long words are never broken, so a piece can still
    exceed *max_chars* if a single word does). Chunks of 80 characters or
    fewer are discarded as too short to be useful retrieval context.

    Args:
        text: The raw text to split.
        max_chars: Soft upper bound on chunk length (default 800).

    Returns:
        A list of chunk strings, in the same order as the input text.
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # BUG FIX: flush the pending buffer *before* emitting the wrapped
            # pieces, so chunks preserve document order. Previously the
            # pieces of a long paragraph were appended ahead of earlier
            # buffered paragraphs, scrambling the retrieval context.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            buf = (buf + "\n" + p).strip()
        else:
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    # Drop fragments too short to carry useful context.
    return [c for c in chunks if len(c) > 80]
50
-
51
-
52
# ================== Load Dataset from Hugging Face ==================
print("⬇️ Loading Hugging Face dataset...")
ds = load_dataset("aimanathar/virtualtranr")  # replace with your dataset name

# Keep every non-empty "text" value from the train split.
dataset_texts = [
    row["text"].strip()
    for row in ds["train"]
    if "text" in row and row["text"].strip()
]

print(f"✅ Loaded {len(dataset_texts)} rows from dataset.")
63
-
64
-
65
# ================== Make Chunks ==================
# Flatten every dataset row into retrieval-sized text chunks.
all_chunks = [chunk for row_text in dataset_texts for chunk in chunk_text(row_text)]

print(f"✅ Created {len(all_chunks)} chunks from dataset.")
71
-
72
-
73
# ================== Embeddings + FAISS ==================
# Normalized MiniLM embeddings + an inner-product index give cosine search.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
chunk_vectors = embed_model.encode(all_chunks, normalize_embeddings=True).astype("float32")
index = faiss.IndexFlatIP(chunk_vectors.shape[1])
index.add(chunk_vectors)
79
-
80
-
81
def retrieve(query, k=5):
    """Return the *k* chunks most similar to *query*, joined by blank lines."""
    q_vec = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _, hit_ids = index.search(q_vec, k)
    return "\n\n".join(all_chunks[i] for i in hit_ids[0])
86
-
87
-
88
# ================== Model ==================
# FLAN-T5 base: a small instruction-tuned seq2seq model for answer generation.
model_id = "google/flan-t5-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
94
-
95
-
96
# ================== Chat Function ==================
def chat_fn(message, history=None):
    """Answer *message* using retrieved dataset context (RAG pipeline).

    Args:
        message: The user's question.
        history: Chat history supplied by gr.ChatInterface; unused because
            each answer is grounded only in freshly retrieved context.

    Returns:
        The generated answer as a stripped string.
    """
    context = retrieve(message, k=5)  # ground the answer in dataset chunks
    prompt = f"Context:\n{context}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    inputs = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    # FIX: disable gradient tracking at inference — the original built an
    # autograd graph on every request, wasting memory and time.
    with torch.no_grad():
        out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
    return tok.decode(out[0], skip_special_tokens=True).strip()
103
-
104
-
105
# ================== Gradio Interface ==================
# Wire chat_fn to a standard Gradio chat UI (message box + history pane).
iface = gr.ChatInterface(
    chat_fn,
    title="💬 Practical Chatbot",
    description=(
        "Ask about Physics & Chemistry Practicals (Class 9–10). "
        "Powered by Hugging Face dataset + PDFs."
    ),
)

iface.launch()