aimanathar committed on
Commit
12192cf
·
verified ·
1 Parent(s): 1c32bdc

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import warnings
3
+
4
+ import faiss
5
+ import gradio as gr
6
+ import numpy as np
7
+ import pdfplumber
8
+ import pytesseract
9
+ import torch
10
+ from datasets import load_dataset
11
+ from pdf2image import convert_from_path
12
+ from pdfminer.high_level import extract_text
13
+ from sentence_transformers import SentenceTransformer
14
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
+
16
+ warnings.filterwarnings("ignore")
17
+
18
+ # ================== PDF Handling Functions ==================
19
def pdf_to_text(path, min_chars=200, dpi=200):
    """Extract plain text from a PDF, falling back to OCR for scans.

    Tries pdfminer's text-layer extraction first; if that yields fewer
    than ``min_chars`` stripped characters (typical for scanned PDFs),
    renders each page with pdf2image and OCRs it with Tesseract.

    Args:
        path: Filesystem path to the PDF file.
        min_chars: Minimum stripped-text length below which the OCR
            fallback is attempted (previously hard-coded to 200).
        dpi: Page-rendering resolution for the OCR fallback
            (previously hard-coded to 200).

    Returns:
        str: The extracted text, or "" if both strategies fail.
    """
    try:
        txt = extract_text(path) or ""
    except Exception:
        # Best-effort: a corrupt/unreadable PDF must not crash startup.
        txt = ""

    if len(txt.strip()) < min_chars:  # text layer missing or thin -> OCR
        try:
            pages = convert_from_path(path, dpi=dpi)
            ocr_all = [pytesseract.image_to_string(img) for img in pages]
            txt = "\n".join(ocr_all)
        except Exception:
            # OCR toolchain (poppler/tesseract) unavailable or failed.
            txt = ""
    return txt
33
+
34
+
35
def chunk_text(text, max_chars=800, min_chars=80):
    """Split *text* into retrieval-sized chunks, preserving order.

    Non-empty, newline-separated paragraphs are greedily packed into
    buffers of at most ``max_chars`` characters. Paragraphs longer than
    ``max_chars`` are wrapped with :func:`textwrap.wrap` (unbreakable
    long words are kept intact). Chunks of ``min_chars`` characters or
    fewer are dropped as noise.

    Bug fix: the original did not flush the pending buffer before
    emitting the wrapped pieces of an over-long paragraph, so earlier
    short paragraphs appeared *after* the long one in the output.

    Args:
        text: Raw document text.
        max_chars: Maximum size of a packed chunk.
        min_chars: Chunks must be strictly longer than this to be kept
            (previously hard-coded to 80).

    Returns:
        list[str]: Document-ordered chunks.
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush pending text first so chunk order follows the document.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            buf = (buf + "\n" + p).strip()
        else:
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    return [c for c in chunks if len(c) > min_chars]
52
+
53
# ================== Load Dataset ==================
print("📥 Loading dataset...")
ds = load_dataset("aimanathar/virtualtranr")

all_texts = []
for item in ds["train"]:
    # Only records carrying a file path point at a PDF we can read.
    # NOTE(review): source indentation was lost in extraction; the text
    # check below is assumed to sit inside this branch — confirm.
    if "file" in item:
        with open(item["file"], "rb") as f:
            try:
                with pdfplumber.open(f) as pdf:
                    page_texts = (page.extract_text() or "" for page in pdf.pages)
                    txt = "\n".join(page_texts)
            except Exception:
                # pdfplumber failed on this file -> retry via the
                # pdfminer/OCR pipeline.
                txt = pdf_to_text(item["file"])
        # Keep only documents that produced a meaningful amount of text.
        if txt and len(txt.strip()) > 80:
            all_texts.append(txt)

if not all_texts:
    all_texts = ["Sample fallback text about Physics & Chemistry Practicals"]

print(f"✅ Extracted {len(all_texts)} documents.")
73
+
74
# ================== Chunk + Embeddings ==================

# Flatten every extracted document into retrieval-sized chunks.
chunks = [c for t in all_texts for c in chunk_text(t, 800)]

if not chunks:
    print("⚠️ Warning: No chunks extracted. Using fallback text.")
    chunks = [
        "This is a fallback context. The dataset PDFs could not be chunked. "
        "The chatbot will still run, but answers may be generic."
    ]

print(f"✅ Total chunks: {len(chunks)}")

# Normalized embeddings mean inner product == cosine similarity below.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
emb = embed_model.encode(chunks, normalize_embeddings=True).astype("float32")

# Exact (brute-force) inner-product index over all chunk vectors.
dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)
95
+
96
+
97
# ================== Load Model ==================
# Pick the device first so the model can be placed in one chained call.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
# nn.Module.to() returns the module itself, so load + placement chain.
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
103
+
104
# ================== Chat Function ==================
def retrieve(query, k=4):
    """Return the top-``k`` corpus chunks most similar to *query*.

    Embeds the query with the same normalized MiniLM encoder used for
    the corpus and searches the FAISS inner-product index (cosine
    similarity, since both sides are L2-normalized).

    Fix: ``chat_fn`` called ``retrieve`` but it was never defined
    anywhere in the file, so every chat request raised NameError.

    Args:
        query: The user's question text.
        k: Number of chunks to retrieve.

    Returns:
        str: The matching chunks joined by blank lines.
    """
    q = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _, ids = index.search(q, k)
    # FAISS pads with -1 when fewer than k vectors are indexed.
    hits = [chunks[i] for i in ids[0] if 0 <= i < len(chunks)]
    return "\n\n".join(hits)


def chat_fn(message, history=None):
    """Answer *message* with RAG: retrieve context, then generate.

    Args:
        message: The user's question.
        history: Chat history supplied by gr.ChatInterface; unused.

    Returns:
        str: The decoded model answer.
    """
    context = retrieve(message)
    prompt = f"Context:\n{context}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    inputs = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    # Deterministic beam search keeps answers reproducible.
    out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
    return tok.decode(out[0], skip_special_tokens=True).strip()
111
+
112
# ================== Gradio Interface ==================
# ChatInterface wires chat_fn(message, history) into a ready-made chat UI.
iface = gr.ChatInterface(
fn=chat_fn,
title="💬 Practical Chatbot",
description="Ask about Physics & Chemistry Practicals (Class 9–10)."
)

# Blocking call that serves the app (entry point of the Space).
iface.launch()