aimanathar committed on
Commit
dea984d
·
verified ·
1 Parent(s): 12192cf

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -119
app.py DELETED
@@ -1,119 +0,0 @@
1
- import textwrap
2
- import warnings
3
-
4
- import faiss
5
- import gradio as gr
6
- import numpy as np
7
- import pdfplumber
8
- import pytesseract
9
- import torch
10
- from datasets import load_dataset
11
- from pdf2image import convert_from_path
12
- from pdfminer.high_level import extract_text
13
- from sentence_transformers import SentenceTransformer
14
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
-
16
- warnings.filterwarnings("ignore")
17
-
18
- # ================== PDF Handling Functions ==================
19
def pdf_to_text(path):
    """Extract text from the PDF at *path*, falling back to OCR.

    Tries pdfminer's text layer first; if that yields fewer than 200
    usable characters (typical of scanned/image-only PDFs), rasterizes
    each page and runs Tesseract OCR instead.  Returns "" on failure.
    """
    try:
        extracted = extract_text(path) or ""
    except Exception:
        extracted = ""

    # A near-empty text layer suggests a scanned document — OCR it.
    if len(extracted.strip()) < 200:
        try:
            page_images = convert_from_path(path, dpi=200)
            extracted = "\n".join(
                pytesseract.image_to_string(img) for img in page_images
            )
        except Exception:
            extracted = ""
    return extracted
33
-
34
-
35
def chunk_text(text, max_chars=800):
    """Split *text* into chunks of at most *max_chars* characters.

    Paragraphs (newline-separated, stripped, non-empty) are greedily
    packed into chunks; a paragraph longer than *max_chars* is wrapped
    on word boundaries instead.  Chunks of 80 characters or fewer are
    dropped as too short to be useful retrieval context.

    Returns a list of chunk strings in document order.
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # BUG FIX: flush the accumulator *before* emitting the wrapped
            # pieces.  Previously buf was left pending, so text buffered from
            # earlier paragraphs was appended after the long paragraph's
            # pieces, breaking document order of the chunks.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        elif len(buf) + len(p) + 1 <= max_chars:
            buf = (buf + "\n" + p).strip()
        else:
            if buf:
                chunks.append(buf)
            buf = p
    if buf:
        chunks.append(buf)
    # Drop fragments too short to carry meaningful context.
    return [c for c in chunks if len(c) > 80]
52
-
53
# ================== Load Dataset ==================
print("📥 Loading dataset...")
ds = load_dataset("aimanathar/virtualtranr")

all_texts = []
for row in ds["train"]:
    # Only rows that carry a PDF file path can be extracted.
    if "file" in row:
        with open(row["file"], "rb") as fh:
            try:
                with pdfplumber.open(fh) as pdf:
                    txt = "\n".join(page.extract_text() or "" for page in pdf.pages)
            except Exception:
                # pdfplumber failed — retry via the pdfminer/OCR fallback.
                txt = pdf_to_text(row["file"])
        if txt and len(txt.strip()) > 80:
            all_texts.append(txt)

if not all_texts:
    all_texts = ["Sample fallback text about Physics & Chemistry Practicals"]

print(f"✅ Extracted {len(all_texts)} documents.")
73
-
74
# ================== Chunk + Embeddings ==================
chunks = []
for doc in all_texts:
    chunks.extend(chunk_text(doc, 800))

if not chunks:
    print("⚠️ Warning: No chunks extracted. Using fallback text.")
    chunks = [
        "This is a fallback context. The dataset PDFs could not be chunked. "
        "The chatbot will still run, but answers may be generic."
    ]

print(f"✅ Total chunks: {len(chunks)}")

# Embed every chunk once and index with inner product; since the
# embeddings are L2-normalized, inner product equals cosine similarity.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
emb = embed_model.encode(chunks, normalize_embeddings=True).astype("float32")

index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)
95
-
96
-
97
# ================== Load Model ==================
model_id = "google/flan-t5-base"
# Prefer GPU when available; CPU generation works but is slower.
device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
103
-
104
# ================== Chat Function ==================
def chat_fn(message, history=None):
    """Answer *message* with retrieval-augmented generation.

    Embeds the question, retrieves the most similar chunks from the
    module-level FAISS index, and asks the seq2seq model to answer from
    that context.  *history* is accepted (gr.ChatInterface passes it)
    but unused.  Returns the generated answer string.
    """
    # BUG FIX: the original called retrieve(message), but no `retrieve`
    # is defined anywhere in this file, so every message raised a
    # NameError.  Retrieval is now done inline against the module-level
    # embed_model / index / chunks built above.
    query_emb = embed_model.encode(
        [message], normalize_embeddings=True
    ).astype("float32")
    _, hit_ids = index.search(query_emb, min(3, len(chunks)))
    # FAISS pads missing results with -1; skip those.
    context = "\n\n".join(chunks[i] for i in hit_ids[0] if i >= 0)

    prompt = f"Context:\n{context}\n\nQuestion:\n{message}\n\nAnswer clearly and exam-ready."
    inputs = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
    return tok.decode(out[0], skip_special_tokens=True).strip()
111
-
112
# ================== Gradio Interface ==================
# Chat UI; ChatInterface passes (message, history) through to chat_fn.
iface = gr.ChatInterface(
    chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10).",
)
iface.launch()