aimanathar commited on
Commit
0e54eef
·
verified ·
1 Parent(s): ee62456

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import warnings
3
+
4
+ import faiss
5
+ import gradio as gr
6
+ import numpy as np
7
+ import pytesseract
8
+ import torch
9
+ from datasets import load_dataset # ✅ Hugging Face Dataset
10
+ from pdf2image import convert_from_path
11
+ from pdfminer.high_level import extract_text
12
+ from sentence_transformers import SentenceTransformer
13
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
14
+
15
+ warnings.filterwarnings("ignore")
16
+
17
+ # ================== PDF Handling ==================
18
def pdf_to_text(path):
    """Extract text from the PDF at *path*.

    First tries direct text extraction (pdfminer). If that yields fewer
    than 200 meaningful characters — typical of a scanned document — it
    falls back to rendering each page and running Tesseract OCR.
    Returns "" when both strategies fail.
    """
    try:
        text = extract_text(path) or ""
    except Exception:
        text = ""

    # A near-empty result suggests an image-only (scanned) PDF: OCR it.
    if len(text.strip()) < 200:
        try:
            pages = convert_from_path(path, dpi=200)
            text = "\n".join(pytesseract.image_to_string(page) for page in pages)
        except Exception:
            text = ""

    return text
31
+
32
+
33
def chunk_text(text, max_chars=800):
    """Split *text* into retrieval-sized chunks of at most *max_chars* chars.

    Paragraphs (newline-separated) are packed greedily into a buffer; a
    paragraph longer than *max_chars* is word-wrapped into pieces.
    Chunks of 80 characters or fewer are dropped as too short to be
    useful retrieval context.

    Bug fix vs. the original: the buffer is now flushed *before* an
    over-long paragraph is wrapped, so chunks come out in document order
    (previously the wrapped pieces jumped ahead of earlier buffered text).
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush pending short paragraphs first to preserve ordering.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            # Paragraph fits into the current buffer (plus a separator).
            buf = (buf + "\n" + p).strip()
        else:
            # Buffer is full: emit it and start a new one with this paragraph.
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    # Discard fragments too small to carry meaningful context.
    return [c for c in chunks if len(c) > 80]
50
+
51
+
52
# ================== Load Dataset from Hugging Face ==================
print("⬇️ Loading Hugging Face dataset...")
ds = load_dataset("aimanathar/virtualtranr")  # replace with your dataset name

# The train split is expected to expose a "text" column; keep only
# rows whose text is present and non-blank.
dataset_texts = [
    row["text"].strip()
    for row in ds["train"]
    if "text" in row and row["text"].strip()
]

print(f"✅ Loaded {len(dataset_texts)} rows from dataset.")
63
+
64
+
65
# ================== Make Chunks ==================
# Flatten every dataset row into retrieval-sized chunks.
all_chunks = [chunk for text in dataset_texts for chunk in chunk_text(text)]

print(f"✅ Created {len(all_chunks)} chunks from dataset.")
71
+
72
+
73
# ================== Embeddings + FAISS ==================
# Encode every chunk with a small sentence-transformer and build an
# exact (flat) FAISS index. normalize_embeddings=True yields unit-length
# vectors, so inner product (IndexFlatIP) is equivalent to cosine similarity.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
emb = embed_model.encode(all_chunks, normalize_embeddings=True).astype("float32")
dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)
79
+
80
+
81
def retrieve(query, k=5):
    """Return the *k* chunks most similar to *query*, joined by blank lines.

    The query is embedded with the same normalized encoder as the index,
    so the inner-product search ranks by cosine similarity.
    """
    query_vec = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _scores, hit_ids = index.search(query_vec, k)
    context = "\n\n".join(all_chunks[i] for i in hit_ids[0])
    return context
86
+
87
+
88
# ================== Model ==================
# Small instruction-tuned seq2seq model used to generate answers from
# the retrieved context.
model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Prefer GPU when available; fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(device)
94
+
95
+
96
# ================== Chat Function ==================
def chat_fn(message, history=None):
    """Answer *message* using the RAG pipeline.

    Retrieves the top-5 dataset chunks as context, builds a prompt, and
    generates a deterministic (beam-search, no sampling) answer with the
    seq2seq model. *history* is accepted for gr.ChatInterface but unused.
    """
    context = retrieve(message, k=5)  # context drawn from the dataset index
    prompt = f"Context:\n{context}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    encoded = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    generated = gen_model.generate(**encoded, max_new_tokens=120, num_beams=4, do_sample=False)
    answer = tok.decode(generated[0], skip_special_tokens=True)
    return answer.strip()
103
+
104
+
105
# ================== Gradio Interface ==================
# Chat UI wiring: gr.ChatInterface calls chat_fn with (message, history).
iface = gr.ChatInterface(
    fn=chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10). Powered by Hugging Face dataset + PDFs."
)

iface.launch()