Spaces:

aimanathar
/

virtual_trainr

Sleeping

App Files Files Community

aimanathar committed on Sep 13, 2025

Commit

dea984d

verified ·

1 Parent(s): 12192cf

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -119

app.py DELETED Viewed

@@ -1,119 +0,0 @@
-import textwrap
-import warnings
-import faiss
-import gradio as gr
-import numpy as np
-import pdfplumber
-import pytesseract
-import torch
-from datasets import load_dataset
-from pdf2image import convert_from_path
-from pdfminer.high_level import extract_text
-from sentence_transformers import SentenceTransformer
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-warnings.filterwarnings("ignore")
-# ================== PDF Handling Functions ==================
-def pdf_to_text(path):
-    """Extract text from the PDF at *path*, falling back to OCR for sparse results.
-    Args:
-        path: Location of the PDF; passed unchanged to ``extract_text`` and
-            ``convert_from_path``.
-    Returns:
-        The extracted text, or an empty string when neither the text layer
-        nor OCR produced anything. Errors are swallowed (best effort).
-    """
-    try:
-        txt = extract_text(path) or ""
-    except Exception:
-        # Best effort: an unreadable text layer counts as "no text" so OCR can still try.
-        txt = ""
-    if len(txt.strip()) >= 200:
-        return txt
-    # Fewer than 200 characters of text: render the pages and OCR them.
-    try:
-        pages = convert_from_path(path, dpi=200)
-        ocr_txt = "\n".join(pytesseract.image_to_string(img) for img in pages)
-    except Exception:
-        # An OCR failure must not discard the short text already extracted.
-        return txt
-    # Use the OCR result only when it recovered more content than the text layer.
-    return ocr_txt if len(ocr_txt.strip()) > len(txt.strip()) else txt
-def chunk_text(text, max_chars=800):
-    """Split *text* into chunks of at most roughly *max_chars* characters, in document order.
-    Non-empty lines are treated as paragraphs. Short paragraphs are merged
-    (joined by newlines) while the merged size stays within *max_chars*; a
-    paragraph longer than *max_chars* is wrapped on whitespace into several
-    pieces. Words are never broken, so a piece can exceed *max_chars* when a
-    single word is longer than the limit.
-    Args:
-        text: The text to split.
-        max_chars: Target upper bound for the length of each chunk.
-    Returns:
-        A list of chunks; chunks of 80 characters or fewer are dropped.
-    """
-    chunks, buf = [], ""
-    for raw in text.split("\n"):
-        p = raw.strip()
-        if not p:
-            continue
-        if len(p) > max_chars:
-            # Flush pending short paragraphs first so chunks keep document order.
-            if buf:
-                chunks.append(buf)
-                buf = ""
-            chunks.extend(
-                piece.strip()
-                for piece in textwrap.wrap(p, width=max_chars, break_long_words=False)
-            )
-        elif len(buf) + len(p) + 1 <= max_chars:
-            # +1 accounts for the newline that joins the paragraphs.
-            buf = (buf + "\n" + p).strip()
-        else:
-            if buf:
-                chunks.append(buf)
-            buf = p
-    if buf:
-        chunks.append(buf)
-    return [c for c in chunks if len(c) > 80]
-# ================== Load Dataset ==================
-# Load the dataset and pull plain text out of every PDF it references.
-print("📥 Loading dataset...")
-ds = load_dataset("aimanathar/virtualtranr")
-all_texts = []
-for item in ds["train"]:
-    if "file" not in item:   # only rows that carry a "file" column can be read
-        continue
-    pdf_path = item["file"]
-    try:
-        # open() is inside the try so a missing or unreadable file falls back
-        # to pdf_to_text instead of aborting startup.
-        with open(pdf_path, "rb") as f, pdfplumber.open(f) as pdf:
-            txt = "\n".join(page.extract_text() or "" for page in pdf.pages)
-    except Exception:
-        txt = pdf_to_text(pdf_path)
-    # Keep only documents with more than 80 characters of real content.
-    if txt and len(txt.strip()) > 80:
-        all_texts.append(txt)
-if not all_texts:
-    all_texts = ["Sample fallback text about Physics & Chemistry Practicals"]
-print(f"✅ Extracted {len(all_texts)} documents.")
-# ================== Chunk + Embeddings ==================
-# Flatten every document into retrieval chunks of up to 800 characters.
-chunks = [piece for doc in all_texts for piece in chunk_text(doc, 800)]
-if len(chunks) == 0:
-    print("⚠️ Warning: No chunks extracted. Using fallback text.")
-    chunks = [
-        "This is a fallback context. The dataset PDFs could not be chunked. "
-        "The chatbot will still run, but answers may be generic."
-    ]
-print(f"✅ Total chunks: {len(chunks)}")
-# Embed the chunks (normalized, float32) and store them in an inner-product index.
-embed_model = SentenceTransformer("all-MiniLM-L6-v2")
-emb = embed_model.encode(chunks, normalize_embeddings=True)
-emb = emb.astype("float32")
-dim = emb.shape[1]
-index = faiss.IndexFlatIP(dim)
-index.add(emb)
-# ================== Load Model ==================
-# Run on the GPU when torch reports one, otherwise on the CPU.
-if torch.cuda.is_available():
-    device = "cuda"
-else:
-    device = "cpu"
-# Tokenizer and seq2seq model come from the same checkpoint.
-model_id = "google/flan-t5-base"
-tok = AutoTokenizer.from_pretrained(model_id)
-gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
-gen_model.to(device)
-# ================== Chat Function ==================
-def retrieve(query, k=3):
-    """Return the chunks most similar to *query*, joined into one context string.
-    Args:
-        query: The user question to embed and look up in ``index``.
-        k: Maximum number of chunks to return.
-    Returns:
-        Up to *k* entries of ``chunks`` separated by blank lines, or an empty
-        string when *query* is blank.
-    """
-    if not query or not query.strip():
-        return ""
-    # Encode the query the same way the chunks were encoded (normalized, float32).
-    q_emb = embed_model.encode([query], normalize_embeddings=True).astype("float32")
-    top_k = max(1, min(k, len(chunks)))
-    _, ids = index.search(q_emb, top_k)
-    # Negative ids are skipped; FAISS uses -1 when fewer than top_k hits exist.
-    return "\n\n".join(chunks[i] for i in ids[0] if i >= 0)
-def chat_fn(message, history=None):
-    """Answer *message* with the seq2seq model, grounded in retrieved context.
-    Args:
-        message: The user's question.
-        history: Earlier chat turns supplied by the chat UI; not used.
-    Returns:
-        The decoded model output with surrounding whitespace stripped.
-    """
-    context = retrieve(message)
-    prompt = f"Context:\n{context}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
-    # Truncate so an oversized context cannot exceed the tokenizer limit set here.
-    inputs = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
-    out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
-    return tok.decode(out[0], skip_special_tokens=True).strip()
-# ================== Gradio Interface ==================
-# Build the chat UI around chat_fn and start serving it.
-_chat_ui_kwargs = {
-    "fn": chat_fn,
-    "title": "💬 Practical Chatbot",
-    "description": "Ask about Physics & Chemistry Practicals (Class 9–10).",
-}
-iface = gr.ChatInterface(**_chat_ui_kwargs)
-iface.launch()