Mahrukhh committed on
Commit
02dfc5a
Β·
verified Β·
1 Parent(s): dd0aec1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -0
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import faiss
3
+ from PyPDF2 import PdfReader
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from groq import Groq
7
+ import os
8
+
9
+ # πŸ—οΈ Use secret in Hugging Face Spaces
10
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
11
+
12
+ if not GROQ_API_KEY:
13
+ st.error("❌ GROQ_API_KEY not found. Please add it in the Hugging Face Space secrets.")
14
+ st.stop()
15
+
16
+ client = Groq(api_key=GROQ_API_KEY)
17
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
18
+
19
+ # --- Helper Functions ---
20
def extract_text_from_pdf(uploaded_file):
    """Extract and concatenate the text of every page in an uploaded PDF.

    Pages that yield no text (e.g. scanned images) are skipped. Pages are
    joined with a newline.
    """
    reader = PdfReader(uploaded_file)
    # Walrus binds each page's text so extract_text() runs only once per page
    # (the previous version extracted every page twice: once for the filter,
    # once for the join).
    return "\n".join(text for page in reader.pages if (text := page.extract_text()))
23
+
24
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into word-based chunks.

    Each chunk contains up to ``chunk_size`` words, and consecutive chunks
    share ``overlap`` words of context.

    Raises:
        ValueError: if ``overlap`` is not smaller than ``chunk_size`` (the
            stride would be zero or negative, which previously surfaced as an
            opaque ``range()`` error).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
27
+
28
def vectorize_chunks(chunks):
    """Encode each text chunk into a dense vector with the shared embedder."""
    vectors = embedder.encode(chunks)
    return vectors
30
+
31
def store_embeddings(vectors):
    """Build a flat L2 FAISS index over the given embedding matrix.

    NOTE(review): the UI below never searches this index — retrieval is done
    with brute-force cosine similarity in get_relevant_chunk.
    """
    n_dims = vectors.shape[1]
    faiss_index = faiss.IndexFlatL2(n_dims)
    faiss_index.add(vectors)
    return faiss_index
36
+
37
def get_relevant_chunk(query, chunks, embeddings):
    """Return the chunk whose embedding is most cosine-similar to *query*."""
    encoded_query = embedder.encode([query])
    similarities = cosine_similarity(encoded_query, embeddings)[0]
    best_idx = similarities.argmax()
    return chunks[best_idx]
41
+
42
# --- Streamlit UI ---
st.set_page_config(page_title="RAG PDF Q&A with Groq", layout="wide")
st.title("πŸ“„ Ask Questions from Your PDF")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Build the retrieval artifacts once per uploaded document.
    pdf_text = extract_text_from_pdf(uploaded_file)
    doc_chunks = chunk_text(pdf_text)
    doc_embeddings = vectorize_chunks(doc_chunks)
    vector_index = store_embeddings(doc_embeddings)

    st.success("βœ… PDF processed successfully!")

    question = st.text_input("πŸ’¬ Ask a question:")
    if question:
        # Retrieve the best-matching chunk and let the LLM answer from it.
        context_chunk = get_relevant_chunk(question, doc_chunks, doc_embeddings)

        completion = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {
                    "role": "user",
                    "content": f"Use this context to answer:\n\n{context_chunk}\n\nQuestion: {question}",
                }
            ],
        )
        st.markdown("### βœ… Answer")
        st.write(completion.choices[0].message.content)