anamjafar6 committed
Commit 9163f79 · verified · 1 Parent(s): ae42f0c

Update app.py

Files changed (1)
  1. app.py +167 -85
app.py CHANGED
@@ -1,96 +1,178 @@
  import streamlit as st
- from pypdf import PdfReader
- from sentence_transformers import SentenceTransformer
  import chromadb
- from chromadb.utils import embedding_functions
  from groq import Groq
- import os
-
- # -------------------------------
- # 1. Setup
- # -------------------------------
- st.set_page_config(page_title="📖 RAG Tutor", layout="wide")
- st.title("📚 RAG Tutor – Learn from Your Book")
-
- # Load API key from Hugging Face secrets
- api_key = os.environ.get("GROQ_API_KEY")
- if not api_key:
-     st.error("❌ Missing GROQ_API_KEY. Please add it in Hugging Face Secrets.")
-     st.stop()
-
- client = Groq(api_key=api_key)
-
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
- # Create ChromaDB in-memory instance
- chroma_client = chromadb.Client()
- collection = chroma_client.create_collection(
-     name="book_chunks",
-     embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
-         model_name="all-MiniLM-L6-v2"
-     )
- )
-
- # -------------------------------
- # 2. PDF Upload + Processing
- # -------------------------------
- uploaded_file = st.file_uploader("📂 Upload a PDF book", type=["pdf"])
-
- if uploaded_file:
-     reader = PdfReader(uploaded_file)
-     text = ""
-     for i, page in enumerate(reader.pages):
-         page_text = page.extract_text()
-         if page_text:
-             text += f"[Page {i+1}]\n" + page_text + "\n"
-
-     # Split into ~300 word chunks
-     words = text.split()
-     chunks = [" ".join(words[i:i+300]) for i in range(0, len(words), 300)]
-
-     # Store chunks in ChromaDB
-     for idx, chunk in enumerate(chunks):
          collection.add(
-             documents=[chunk],
-             metadatas=[{"source": f"Page {idx//1+1}"}],
-             ids=[str(idx)]
          )
-     st.success("✅ PDF processed and stored in memory!")
-
- # -------------------------------
- # 3. Ask Questions
- # -------------------------------
- question = st.text_input("❓ Ask a question about the book")
-
- if st.button("Get Answer") and question:
-     q_embedding = embedder.encode(question).tolist()
-
-     results = collection.query(
-         query_embeddings=[q_embedding],
-         n_results=3
-     )
-
-     if results["documents"][0]:
-         context = "\n\n".join(results["documents"][0])
-         sources = [m["source"] for m in results["metadatas"][0]]
-
-         # Prompt LLM with context
          prompt = f"""
- You are a tutor limited to the given book excerpts.
- Answer ONLY from the book. Always provide [Page].
- If not enough info, say: ❌ Insufficient evidence.

- Context:
- {context}

- Question: {question}
- Answer:
- """
-         response = client.chat.completions.create(
              model="llama3-8b-8192",
-             messages=[{"role": "user", "content": prompt}]
          )
-         st.write(response.choices[0].message.content)
-         st.caption(f"📑 Sources: {', '.join(sources)}")
-     else:
-         st.error("❌ Insufficient evidence.")
  import streamlit as st
+ import os
+ import pypdf
  import chromadb
+ from sentence_transformers import SentenceTransformer
  from groq import Groq
+ from typing import List, Dict, Any, Optional
+
+ # CONFIG
+ SIMILARITY_THRESHOLD = 0.2
+ TOP_K_CHUNKS = 3
+ CHUNK_SIZE = 300
+ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+ # PDF extraction
+ def extract_text_from_pdf(pdf_file) -> Dict[str, Any]:
+     try:
+         pdf_reader = pypdf.PdfReader(pdf_file)
+         pages_text = []
+         for page_num, page in enumerate(pdf_reader.pages):
+             page_text = page.extract_text()
+             if page_text and page_text.strip():
+                 pages_text.append({
+                     'page_number': page_num + 1,
+                     'text': page_text.strip()
+                 })
+         return {"success": True, "pages": pages_text, "total_pages": len(pages_text)}
+     except Exception as e:
+         return {"success": False, "error": str(e)}
+
+ # Chunking
+ def create_chunks(pages_text: List[Dict]) -> List[Dict]:
+     chunks = []
+     chunk_id = 0
+     for page_data in pages_text:
+         words = page_data['text'].split()
+         for i in range(0, len(words), CHUNK_SIZE):
+             chunk_words = words[i:i + CHUNK_SIZE]
+             if len(chunk_words) > 20:
+                 chunks.append({
+                     "id": chunk_id,
+                     "text": " ".join(chunk_words),
+                     "page_number": page_data['page_number'],
+                     "word_count": len(chunk_words)
+                 })
+                 chunk_id += 1
+     return chunks
+
+ # Embedding model
+ @st.cache_resource
+ def load_embedding_model():
+     return SentenceTransformer(EMBEDDING_MODEL)
+
+ # Vector database
+ def create_vector_database(chunks: List[Dict], embedding_model) -> Optional[Any]:
+     try:
+         client = chromadb.Client()
+         # use get_or_create instead of create
+         collection = client.get_or_create_collection("pdf_chunks")
+
+         texts = [c['text'] for c in chunks]
+         embeddings = embedding_model.encode(texts).tolist()

          collection.add(
+             embeddings=embeddings,
+             documents=texts,
+             metadatas=[{
+                 "page_number": c["page_number"],
+                 "chunk_id": c["id"],
+                 "word_count": c["word_count"]
+             } for c in chunks],
+             ids=[str(c["id"]) for c in chunks]
          )
+         return collection
+     except Exception as e:
+         st.error(f"Vector DB error: {e}")
+         return None
+
+ def query_vector_database(collection, query: str, embedding_model, k: int = TOP_K_CHUNKS) -> List[Dict]:
+     try:
+         query_emb = embedding_model.encode([query]).tolist()
+         results = collection.query(query_embeddings=query_emb, n_results=k)
+         relevant_chunks = []
+         for i in range(len(results['documents'][0])):
+             distance = results['distances'][0][i]
+             similarity = max(0, 1 - distance)
+             if similarity >= SIMILARITY_THRESHOLD:
+                 relevant_chunks.append({
+                     "text": results['documents'][0][i],
+                     "page_number": results['metadatas'][0][i]["page_number"],
+                     "similarity": similarity,
+                     "chunk_id": results['metadatas'][0][i]["chunk_id"]
+                 })
+         return relevant_chunks
+     except Exception as e:
+         st.error(f"Query error: {e}")
+         return []
+
+ # Groq setup
+ def setup_groq():
+     api_key = st.secrets.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY")
+     if not api_key:
+         st.error("❌ No GROQ_API_KEY found. Please add it to secrets or env.")
+         return None
+     return Groq(api_key=api_key)
+
+ def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
+     try:
+         context = "\n\n".join([f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks])
          prompt = f"""
+ Based ONLY on the following context from a PDF document, answer the user's question.
+
+ Context:
+ {context}

+ Question: {query}

+ Instructions:
+ - Answer ONLY using info from the context above
+ - If not enough info, reply: ❌ Insufficient evidence
+ - Always include page citations like [Page X]
+ """
+         chat = client.chat.completions.create(
              model="llama3-8b-8192",
+             messages=[
+                 {"role": "system", "content": "You are a helpful tutor AI."},
+                 {"role": "user", "content": prompt}
+             ],
+             temperature=0.1,
+             max_tokens=500
          )
+         return chat.choices[0].message.content
+     except Exception as e:
+         return f"Error generating answer: {e}"
+
+ # Main answer pipeline
+ def generate_answer(query: str, relevant_chunks: List[Dict]) -> str:
+     if not relevant_chunks:
+         return "❌ Insufficient evidence"
+     client = setup_groq()
+     if client:
+         return generate_answer_with_groq(client, query, relevant_chunks)
+     return "❌ No LLM configured."
+
+ # -----------------------------
+ # STREAMLIT MAIN
+ # -----------------------------
+ def main():
+     st.set_page_config(page_title="PageMentor", layout="wide")
+
+     st.title("📚 PageMentor")
+
+     if "vector_db" not in st.session_state:
+         st.session_state.vector_db = None
+         st.session_state.embedding_model = load_embedding_model()
+
+     uploaded_file = st.file_uploader("Upload PDF", type="pdf")
+
+     if uploaded_file and st.button("🚀 Process PDF"):
+         pdf_result = extract_text_from_pdf(uploaded_file)
+         if pdf_result["success"]:
+             chunks = create_chunks(pdf_result["pages"])
+             st.session_state.vector_db = create_vector_database(chunks, st.session_state.embedding_model)
+             if st.session_state.vector_db:
+                 st.success(f"✅ Processed {pdf_result['total_pages']} pages, {len(chunks)} chunks ready!")
+         else:
+             st.error(pdf_result["error"])
+
+     if st.session_state.vector_db:
+         query = st.text_input("Ask a question:")
+         if query and st.button("🔍 Get Answer"):
+             relevant_chunks = query_vector_database(st.session_state.vector_db, query, st.session_state.embedding_model)
+             answer = generate_answer(query, relevant_chunks)
+             st.markdown("### 🎯 Answer")
+             st.write(answer)
+
+ if __name__ == "__main__":
+     main()
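
One caveat on the new retrieval step: `query_vector_database` converts Chroma distances to similarities via `1 - distance`, which is only meaningful for cosine distance, but collections created through `chromadb.Client()` default to L2 space. A minimal sketch of the adjustment (not part of this commit), using Chroma's documented `hnsw:space` metadata key:

# Sketch: create the collection in cosine space so that the
# `similarity = max(0, 1 - distance)` conversion above is well-founded.
# Chroma's default space is "l2"; "cosine" yields distance = 1 - cosine similarity.
collection = client.get_or_create_collection(
    "pdf_chunks",
    metadata={"hnsw:space": "cosine"},
)

With this, SIMILARITY_THRESHOLD = 0.2 filters on true cosine similarity; under the default L2 space the same threshold would behave differently for the same embeddings.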