aimanathar commited on
Commit
da2d95f
·
verified ·
1 Parent(s): 0cdf6e1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import warnings
3
+
4
+ import faiss
5
+ import numpy as np
6
+ import torch
7
+
8
+ warnings.filterwarnings("ignore")
9
+ import gradio as gr
10
+ import pytesseract
11
+ from pdf2image import convert_from_path
12
+ from pdfminer.high_level import extract_text
13
+ from sentence_transformers import SentenceTransformer
14
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
+
16
+
17
+ # ================== PDF Handling Functions ==================
18
def pdf_to_text(path):
    """Extract text from a PDF, falling back to OCR for scanned documents.

    Tries pdfminer's embedded text layer first; if that yields fewer than
    200 characters (likely a scanned/image-only PDF), rasterizes the pages
    with pdf2image and runs Tesseract OCR over them.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        The extracted text, or "" if both strategies produce nothing.
    """
    try:
        txt = extract_text(path) or ""
    except Exception:
        # pdfminer can raise on corrupt/encrypted PDFs; treat as "no text layer".
        txt = ""
    # Heuristic: a usable text layer should give at least ~200 characters.
    if len(txt.strip()) < 200:
        try:
            pages = convert_from_path(path, dpi=200)
            txt = "\n".join(pytesseract.image_to_string(img) for img in pages)
        except Exception:
            # OCR failed (missing poppler/tesseract, unreadable file, ...).
            # Keep whatever pdfminer produced instead of discarding it —
            # the original reset txt to "" here, losing partial extractions.
            pass
    return txt
31
+
32
def chunk_text(text, max_chars=800):
    """Split text into retrieval-sized chunks of at most `max_chars` chars.

    Paragraphs (newline-separated) are greedily packed into a buffer up to
    `max_chars`; oversized paragraphs are word-wrapped into pieces.

    Args:
        text: Raw document text.
        max_chars: Maximum chunk length in characters.

    Returns:
        List of chunks, each longer than 80 characters (shorter fragments
        are dropped as noise), in original document order.
    """
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush the pending buffer FIRST so output preserves document
            # order — the original appended the wrapped pieces while earlier
            # paragraphs were still sitting unflushed in `buf`.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        else:
            if len(buf) + len(p) + 1 <= max_chars:
                buf = (buf + "\n" + p).strip()
            else:
                if buf:
                    chunks.append(buf)
                buf = p
    if buf:
        chunks.append(buf)
    # Drop tiny fragments that carry little retrievable content.
    return [c for c in chunks if len(c) > 80]
47
+
48
+ # ================== Load Embeddings + Model ==================
49
# Sentence-embedding model for semantic retrieval.
# NOTE(review): embed_model — like the faiss and numpy imports — is never
# used anywhere in this file; presumably leftover from a planned RAG
# pipeline over the PDF chunks. Confirm before removing (loading it still
# costs startup time and memory).
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Seq2seq generator used by chat_fn to answer questions.
model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Prefer GPU when available; fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(device)
56
+
57
+ # ================== Chat Function ==================
58
def chat_fn(message, history=None):
    """Answer a single question with FLAN-T5 using deterministic beam search.

    Args:
        message: The user's question text.
        history: Prior chat turns supplied by gr.ChatInterface. Accepted
            for interface compatibility but intentionally unused — every
            turn is answered statelessly.

    Returns:
        The generated answer with special tokens removed and whitespace
        stripped.
    """
    prompt = f"Answer clearly and exam-ready:\n\nQuestion:\n{message}"
    encoded = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024)
    encoded = encoded.to(device)
    # Deterministic decoding: 4 beams, sampling disabled.
    generated = gen_model.generate(**encoded, max_new_tokens=120, num_beams=4, do_sample=False)
    answer = tok.decode(generated[0], skip_special_tokens=True)
    return answer.strip()
63
+
64
+ # ================== Gradio Interface ==================
65
# ChatInterface invokes chat_fn(message, history) for every user turn.
iface = gr.ChatInterface(
    fn=chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10)."
)

# Launched at import time — standard for a Hugging Face Spaces app.py,
# which runs this module directly.
iface.launch()