aimanathar commited on
Commit
befffdd
·
verified ·
1 Parent(s): c29482e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import warnings
3
+
4
+ import faiss
5
+ import gradio as gr
6
+ import numpy as np
7
+ import pdfplumber
8
+ import pytesseract
9
+ import torch
10
+ from datasets import load_dataset
11
+ from pdf2image import convert_from_path
12
+ from pdfminer.high_level import extract_text
13
+ from sentence_transformers import SentenceTransformer
14
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
+
16
+ warnings.filterwarnings("ignore")
17
+
18
# ================== PDF Handling Functions ==================
def pdf_to_text(path):
    """Extract plain text from the PDF at *path*.

    Tries the embedded text layer first (pdfminer). When that yields fewer
    than 200 stripped characters — typically a scanned document — it falls
    back to rasterizing the pages (pdf2image) and running Tesseract OCR.
    Returns "" when both strategies fail.
    """
    try:
        text = extract_text(path) or ""
    except Exception:
        text = ""

    # Almost no text layer -> assume scanned pages and OCR them instead.
    if len(text.strip()) < 200:
        try:
            images = convert_from_path(path, dpi=200)
            text = "\n".join(pytesseract.image_to_string(img) for img in images)
        except Exception:
            text = ""
    return text
33
+
34
+
35
def chunk_text(text, max_chars=800):
    """Split *text* into retrieval chunks of at most *max_chars* characters.

    Non-empty lines are treated as paragraphs and greedily packed into a
    buffer; a paragraph longer than *max_chars* is word-wrapped into pieces.
    Chunks are emitted in document order, and fragments of <= 80 characters
    are dropped as too short to be useful retrieval context.
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush the pending buffer BEFORE emitting the wrapped pieces,
            # otherwise earlier buffered text would appear after this
            # paragraph's pieces and chunks would leave document order.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            # Paragraph still fits in the current buffer (+1 for "\n").
            buf = (buf + "\n" + p).strip()
        else:
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    return [c for c in chunks if len(c) > 80]
52
+
53
# ================== Load Dataset ==================
# Pull the practical-notes dataset and extract text from any PDF files it
# references. This runs at import time (flat Gradio app script).
print("📥 Loading dataset...")
ds = load_dataset("aimanathar/virtualtranr")

all_texts = []
for item in ds["train"]:
    # Reset per record: previously `txt` could be undefined (first record
    # without a "file" key) or silently reuse the previous record's text.
    txt = ""
    if "file" in item:  # dataset rows may carry a PDF file-path column
        try:
            # open() lives inside the try so a missing/unreadable file also
            # falls through to the pdfminer/OCR fallback instead of crashing.
            with open(item["file"], "rb") as f:
                with pdfplumber.open(f) as pdf:
                    txt = "\n".join(page.extract_text() or "" for page in pdf.pages)
        except Exception:
            txt = pdf_to_text(item["file"])
    if txt and len(txt.strip()) > 80:
        all_texts.append(txt)

if not all_texts:
    # Keep the app bootable even when no usable documents were extracted.
    all_texts = ["Sample fallback text about Physics & Chemistry Practicals"]

print(f"✅ Extracted {len(all_texts)} documents.")
73
+
74
# ================== Chunk + Embeddings ==================
chunks = []
for t in all_texts:
    chunks.extend(chunk_text(t, 800))

if not chunks:
    # chunk_text drops pieces of <= 80 chars, so short fallback/doc text can
    # leave nothing; index the raw documents rather than crash below on an
    # empty embedding matrix (emb.shape[1] on a 0-row array).
    chunks = list(all_texts)

print(f"✅ Total chunks: {len(chunks)}")

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Normalized embeddings + inner-product index == cosine-similarity search.
emb = embed_model.encode(chunks, normalize_embeddings=True).astype("float32")

dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)
87
+
88
def retrieve(query, k=5):
    """Return the *k* chunks most similar to *query*, joined by blank lines.

    Similarity is cosine (normalized embeddings over an inner-product FAISS
    index). When the index holds fewer than *k* vectors FAISS pads the
    result ids with -1; those are skipped explicitly — previously
    chunks[-1] silently returned the last chunk for every padded slot.
    """
    q = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _, ids = index.search(q, k)
    return "\n\n".join(chunks[i] for i in ids[0] if i >= 0)
93
+
94
# ================== Load Model ==================
# Seq2seq generator used to answer questions over the retrieved context.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
gen_model.to(device)
100
+
101
# ================== Chat Function ==================
def chat_fn(message, history=None):
    """Generate an exam-ready answer for *message* via RAG.

    Retrieves the most relevant chunks, builds a context+question prompt,
    and decodes a deterministic beam-search answer from the seq2seq model.
    *history* is required by gr.ChatInterface's signature but unused.
    """
    ctx = retrieve(message)
    prompt = f"Context:\n{ctx}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    enc = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=1024,
    ).to(device)
    generated = gen_model.generate(
        **enc, max_new_tokens=120, num_beams=4, do_sample=False
    )
    answer = tok.decode(generated[0], skip_special_tokens=True)
    return answer.strip()
108
+
109
# ================== Gradio Interface ==================
# Chat UI wiring; launch() starts the (blocking) Gradio server.
iface = gr.ChatInterface(
    chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10).",
)

iface.launch()