aimanathar committed on
Commit
c29482e
·
verified ·
1 Parent(s): b007a3e

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -112
app.py DELETED
@@ -1,112 +0,0 @@
1
- import textwrap
2
- import warnings
3
-
4
- import faiss
5
- import gradio as gr
6
- import numpy as np
7
- import pytesseract
8
- import torch
9
- from datasets import load_dataset # ✅ Hugging Face Dataset
10
- from pdf2image import convert_from_path
11
- from pdfminer.high_level import extract_text
12
- from sentence_transformers import SentenceTransformer
13
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
14
-
15
# Hide noisy third-party warnings from the Space logs.
warnings.filterwarnings(action="ignore")
16
-
17
# ================== PDF Handling ==================
def pdf_to_text(path):
    """Extract text from a PDF, falling back to OCR for scanned documents.

    Tries pdfminer's embedded text layer first. If that yields fewer than
    200 characters of stripped text, the file is likely image-only, so each
    page is rendered at 200 dpi and run through Tesseract OCR instead.
    Returns "" when both strategies fail.
    """
    try:
        text = extract_text(path) or ""
    except Exception:
        text = ""
    if len(text.strip()) >= 200:
        return text
    # Text layer missing or tiny: rasterize pages and OCR them.
    try:
        pages = convert_from_path(path, dpi=200)
        return "\n".join(pytesseract.image_to_string(img) for img in pages)
    except Exception:
        return ""
31
-
32
-
33
def chunk_text(text, max_chars=800):
    """Split *text* into retrieval-sized chunks of at most ~max_chars chars.

    Newline-separated paragraphs are greedily packed into a buffer until
    adding the next one would exceed *max_chars*. Oversized paragraphs are
    wrapped with textwrap (long words are never broken, so a piece can still
    exceed *max_chars* if a single word does). Chunks of 80 characters or
    fewer are discarded as too short to be useful retrieval context.

    Args:
        text: The raw text to split.
        max_chars: Soft upper bound on chunk length (default 800).

    Returns:
        A list of chunk strings, in the same order as the input text.
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # BUG FIX: flush the pending buffer *before* emitting the wrapped
            # pieces, so chunks preserve document order. Previously the
            # pieces of a long paragraph were appended ahead of earlier
            # buffered paragraphs, scrambling the retrieval context.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            buf = (buf + "\n" + p).strip()
        else:
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    # Drop fragments too short to carry useful context.
    return [c for c in chunks if len(c) > 80]
50
-
51
-
52
# ================== Load Dataset from Hugging Face ==================
print("⬇️ Loading Hugging Face dataset...")
ds = load_dataset("aimanathar/virtualtranr")  # replace with your dataset name

# Keep every non-empty "text" value from the train split.
dataset_texts = [
    row["text"].strip()
    for row in ds["train"]
    if "text" in row and row["text"].strip()
]

print(f"✅ Loaded {len(dataset_texts)} rows from dataset.")
63
-
64
-
65
# ================== Make Chunks ==================
# Flatten every dataset row into retrieval-sized text chunks.
all_chunks = [chunk for row_text in dataset_texts for chunk in chunk_text(row_text)]

print(f"✅ Created {len(all_chunks)} chunks from dataset.")
71
-
72
-
73
# ================== Embeddings + FAISS ==================
# Normalized MiniLM embeddings + an inner-product index give cosine search.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
chunk_vectors = embed_model.encode(all_chunks, normalize_embeddings=True).astype("float32")
index = faiss.IndexFlatIP(chunk_vectors.shape[1])
index.add(chunk_vectors)
79
-
80
-
81
def retrieve(query, k=5):
    """Return the *k* chunks most similar to *query*, joined by blank lines."""
    q_vec = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _, hit_ids = index.search(q_vec, k)
    return "\n\n".join(all_chunks[i] for i in hit_ids[0])
86
-
87
-
88
# ================== Model ==================
# FLAN-T5 base: a small instruction-tuned seq2seq model for answer generation.
model_id = "google/flan-t5-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
94
-
95
-
96
# ================== Chat Function ==================
def chat_fn(message, history=None):
    """Answer *message* using retrieved dataset context (RAG pipeline).

    Args:
        message: The user's question.
        history: Chat history supplied by gr.ChatInterface; unused because
            each answer is grounded only in freshly retrieved context.

    Returns:
        The generated answer as a stripped string.
    """
    context = retrieve(message, k=5)  # ground the answer in dataset chunks
    prompt = f"Context:\n{context}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    inputs = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    # FIX: disable gradient tracking at inference — the original built an
    # autograd graph on every request, wasting memory and time.
    with torch.no_grad():
        out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
    return tok.decode(out[0], skip_special_tokens=True).strip()
103
-
104
-
105
# ================== Gradio Interface ==================
# Wire chat_fn to a standard Gradio chat UI (message box + history pane).
iface = gr.ChatInterface(
    chat_fn,
    title="💬 Practical Chatbot",
    description=(
        "Ask about Physics & Chemistry Practicals (Class 9–10). "
        "Powered by Hugging Face dataset + PDFs."
    ),
)

iface.launch()