Rishitha3 commited on
Commit
3dda9b8
·
verified ·
1 Parent(s): 4f396da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -141
app.py CHANGED
@@ -1,147 +1,92 @@
1
- import streamlit as st
2
- import fitz # PyMuPDF
3
- import re
4
- from sentence_transformers import SentenceTransformer
5
- import numpy as np
6
  import faiss
7
- from transformers import AutoTokenizer, AutoModelForCausalLM
8
  import os
9
- from huggingface_hub import login # ✅ added for token auth
 
 
 
 
 
10
 
11
- # -----------------------------
12
- # PDF Text Loader
13
- # -----------------------------
14
- def load_pdf_text(uploaded_file):
15
- doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
16
  text = ""
17
- for page in doc:
18
- text += page.get_text()
19
- if not text.strip():
20
- raise ValueError("No text found in PDF.")
 
 
 
 
 
 
21
  return text
22
 
23
- # -----------------------------
24
- # Chunk Text
25
- # -----------------------------
26
- def chunk_text(text, max_tokens=200):
27
- sentences = re.split(r'(?<=[.!?]) +', text)
28
- chunks, current_chunk = [], []
29
- current_len = 0
30
- for sentence in sentences:
31
- word_count = len(sentence.split())
32
- if current_len + word_count > max_tokens:
33
- chunks.append(" ".join(current_chunk))
34
- current_chunk = [sentence]
35
- current_len = word_count
36
- else:
37
- current_chunk.append(sentence)
38
- current_len += word_count
39
- if current_chunk:
40
- chunks.append(" ".join(current_chunk))
41
- return chunks
42
-
43
- # -----------------------------
44
- # Simple Vector Store
45
- # -----------------------------
46
- class SimpleVectorStore:
47
- def __init__(self, dim):
48
- self.dim = dim
49
- self.vectors = []
50
- self.metadata = []
51
- self.index = None
52
-
53
- def add(self, vectors, metas):
54
- for v, m in zip(vectors, metas):
55
- vec = np.array(v, dtype=np.float32)
56
- self.vectors.append(vec)
57
- self.metadata.append(m)
58
- if self.vectors:
59
- self.index = faiss.IndexFlatL2(self.dim)
60
- self.index.add(np.stack(self.vectors))
61
-
62
- def search(self, query_vector, k=5):
63
- query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)
64
- D, I = self.index.search(query_vector, k)
65
- results = [self.metadata[i] for i in I[0]]
66
- return results
67
-
68
- # -----------------------------
69
- # Index PDF
70
- # -----------------------------
71
- def index_pdf(uploaded_file):
72
- text = load_pdf_text(uploaded_file)
73
- chunks = chunk_text(text)
74
- embed_model = SentenceTransformer("all-MiniLM-L6-v2")
75
- vectors = embed_model.encode(chunks)
76
- store = SimpleVectorStore(dim=vectors.shape[1])
77
- store.add(vectors, chunks)
78
- return embed_model, store, chunks
79
-
80
- # -----------------------------
81
- # Load LLaMA Model
82
- # -----------------------------
83
- @st.cache_resource
84
- def load_llm():
85
- model_id = "meta-llama/Llama-3.2-3b-instruct"
86
-
87
- # ✅ Get token from HF secrets
88
- hf_token = os.getenv("HF_TOKEN")
89
- if not hf_token:
90
- raise ValueError("HF_TOKEN is not set. Please add it in Hugging Face Secrets.")
91
-
92
- login(hf_token) # ✅ Authenticate with HF Hub
93
-
94
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
95
- llm = AutoModelForCausalLM.from_pretrained(
96
- model_id,
97
- device_map="auto",
98
- torch_dtype="auto",
99
- token=hf_token # ✅ Needed to load gated model
100
- )
101
- return tokenizer, llm
102
-
103
- # -----------------------------
104
- # Streamlit UI
105
- # -----------------------------
106
- st.set_page_config(page_title="Student Assisted Chatbot", page_icon="🤖", layout="wide")
107
- st.title("🎓 Student Assisted Chatbot")
108
- st.write("Upload your textbook (PDF) and ask questions about it.")
109
-
110
- uploaded_file = st.file_uploader("Upload PDF", type="pdf")
111
- user_input = st.text_input("Your question:")
112
-
113
- if uploaded_file and user_input:
114
- try:
115
- embed_model, store, chunks = index_pdf(uploaded_file)
116
- tokenizer, llm = load_llm()
117
-
118
- query_vec = embed_model.encode([user_input])[0]
119
- relevant_chunks = store.search(query_vec, k=5)
120
- context = "\n".join(relevant_chunks)
121
-
122
- prompt = f"""
123
- [INST] You are a helpful tutor. Based only on the context below, answer the question in complete sentences.
124
- If the context does not contain enough information, say "I could not find this in the text."
125
- Context:
126
- {context}
127
- Question: {user_input}
128
- Answer: [/INST]
129
- """
130
-
131
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(llm.device)
132
- outputs = llm.generate(
133
- **inputs,
134
- max_new_tokens=300,
135
- temperature=0.7,
136
- top_p=0.9,
137
- do_sample=True
138
- )
139
-
140
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
141
- if "Answer:" in answer:
142
- answer = answer.split("Answer:")[-1].strip()
143
-
144
- st.write("🧠 Answer")
145
- st.write(answer if answer else "Sorry, I couldn’t generate a complete answer.")
146
- except Exception as e:
147
- st.error(f"Error: {e}")
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF for PDFs
3
+ import docx
 
 
4
  import faiss
 
5
  import os
6
+ from sentence_transformers import SentenceTransformer
7
+ from transformers import pipeline
8
+
9
# 1. Model setup: one sentence embedder (indexing + retrieval) and one
# generator pipeline for answering. Loaded once at module import.
_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
_GEN_MODEL_ID = "gpt2"  # swap for a stronger model when a GPU is available

embedding_model = SentenceTransformer(_EMBED_MODEL_ID)
qa_model = pipeline("text-generation", model=_GEN_MODEL_ID)
12
 
13
# 2. Helper: extract text from files
def extract_text(file):
    """Extract plain text from an uploaded document.

    Accepts either a filesystem path (str) -- what Gradio passes when the
    File component uses type="filepath", as this app's UI does -- or a
    file-like object exposing the path via ``.name`` (older Gradio mode).
    Supports .pdf (PyMuPDF) and .docx (python-docx); anything else is read
    as UTF-8 text with undecodable bytes ignored.

    Returns:
        The extracted text as one string (possibly empty).
    """
    # Normalize to a path string; file-like uploads carry it in .name.
    path = file if isinstance(file, str) else getattr(file, "name", file)
    text = ""
    if path.endswith(".pdf"):
        doc = fitz.open(path)
        try:
            for page in doc:
                text += page.get_text("text")
        finally:
            doc.close()  # release the PyMuPDF handle even on a bad page
    elif path.endswith(".docx"):
        document = docx.Document(path)
        text = "".join(para.text + "\n" for para in document.paragraphs)
    else:  # fallback: treat as plain text
        # The original called file.read(), which breaks when Gradio hands
        # us a path string; open the path explicitly instead.
        with open(path, "rb") as fh:
            text = fh.read().decode("utf-8", errors="ignore")
    return text
27
 
28
# 3. Helper: create FAISS index
def build_faiss(text, chunk_size=500, overlap=50):
    """Chunk *text*, embed the chunks, and build a FAISS L2 index.

    Args:
        text: full document text.
        chunk_size: characters per chunk.
        overlap: characters shared between consecutive chunks; must be
            smaller than chunk_size so the window always advances.

    Returns:
        (index, chunks): the populated ``faiss.IndexFlatL2`` and the list
        of text chunks, where index row i corresponds to chunks[i].

    Raises:
        ValueError: if overlap >= chunk_size, or the text is empty /
            whitespace-only (an empty chunk list would make the
            ``embeddings.shape[1]`` lookup below fail obscurely).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    if not text.strip():
        raise ValueError("Document contains no extractable text.")

    # Fixed-size sliding character window with the requested overlap.
    step = chunk_size - overlap
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), step)]

    # Embed chunks as a (n_chunks, dim) numpy array.
    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)

    # Exact L2 index -- fine for a single document's worth of chunks.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return index, chunks
43
+
44
+ # Global storage
45
+ doc_index = None
46
+ doc_chunks = None
47
+
48
# 4. Process uploaded file
def upload_file(file):
    """Index an uploaded document so it can be queried.

    Extracts the text, builds the FAISS index, and stores the result in
    the module-level doc_index / doc_chunks globals.

    Returns:
        A human-readable status string for the UI's status box.
    """
    global doc_index, doc_chunks
    if file is None:
        # The "Index Document" button can be clicked before a file is chosen.
        return "⚠️ Please choose a file to upload first."
    try:
        text = extract_text(file)
        doc_index, doc_chunks = build_faiss(text)
    except Exception as e:
        # Surface failures in the status box instead of a raw traceback.
        return f"❌ Failed to index document: {e}"
    return "✅ Document indexed! You can now ask questions."
54
+
55
# 5. Answer questions
def answer_query(query):
    """Answer *query* with retrieval-augmented generation over the index.

    Embeds the query the same way the chunks were embedded, retrieves the
    nearest chunks from the FAISS index, and asks the generator model to
    answer from that context.

    Returns:
        The generated answer text, or a warning string if no document has
        been indexed yet.
    """
    global doc_index, doc_chunks
    if doc_index is None:
        return "⚠️ Please upload a document first."

    # Embed query as a (1, dim) array for FAISS.
    q_emb = embedding_model.encode([query], convert_to_numpy=True)

    # Never ask for more neighbours than exist: FAISS pads missing
    # results with index -1, which would silently wrap to the last chunk.
    k = min(3, len(doc_chunks))
    D, I = doc_index.search(q_emb, k=k)
    retrieved = [doc_chunks[i] for i in I[0] if i >= 0]

    # Build the prompt from the retrieved context.
    context = "\n\n".join(retrieved)
    prompt = f"Answer the question based on the context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"

    # Use max_new_tokens, not max_length: the prompt alone (3 x 500-char
    # chunks) can exceed 200 tokens, which makes max_length fail/truncate.
    result = qa_model(prompt, max_new_tokens=200, num_return_sequences=1)[0]["generated_text"]

    # text-generation pipelines echo the prompt; return only the answer.
    if result.startswith(prompt):
        result = result[len(prompt):]
    return result.strip()
75
+
76
# 6. Gradio UI
# Layout mirrors creation order: upload row on top, then status,
# question, answer, and the ask button.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Chat with Any Document (RAG Demo)")

    with gr.Row():
        doc_picker = gr.File(label="Upload Document", type="filepath")
        index_button = gr.Button("Index Document")

    status_box = gr.Textbox(label="Status")
    question_box = gr.Textbox(label="Ask a Question")
    answer_box = gr.Textbox(label="Answer")
    answer_button = gr.Button("Get Answer")

    # Wire the buttons to the backend helpers.
    index_button.click(upload_file, inputs=doc_picker, outputs=status_box)
    answer_button.click(answer_query, inputs=question_box, outputs=answer_box)

demo.launch()