Muqadas-13 committed on
Commit
c9d55fe
·
verified ·
1 Parent(s): 2153a88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -93
app.py CHANGED
@@ -1,97 +1,104 @@
1
- !pip install -q gradio sentence-transformers faiss-cpu pdfplumber groq
2
-
3
- import gradio as gr
4
- import pdfplumber
5
- from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import numpy as np
 
8
  from groq import Groq
 
9
 
10
- # Global vars
11
- model = SentenceTransformer("all-MiniLM-L6-v2")
12
- documents, embeddings, index, text_chunks, client = [], None, None, [], None
13
-
14
- def ask_llama3(system_prompt, user_prompt):
15
- global client
16
- try:
17
- chat_completion = client.chat.completions.create(
18
- model="llama-3.1-8b-instant",
19
- messages=[
20
- {"role": "system", "content": system_prompt},
21
- {"role": "user", "content": user_prompt}
22
- ]
23
- )
24
- return chat_completion.choices[0].message.content
25
- except Exception as e:
26
- return f"❌ LLaMA3 error: {e}"
27
-
28
- def pdf_to_chunks(pdf_file, user_key):
29
- global text_chunks, embeddings, index, client
30
- try:
31
- client = Groq(api_key=user_key)
32
- except Exception as e:
33
- return f"❌ API key error: {e}"
34
-
35
- text_chunks = []
36
- try:
37
- with pdfplumber.open(pdf_file.name) as pdf:
38
- for page in pdf.pages:
39
- text = page.extract_text()
40
- if text:
41
- sentences = text.split(". ")
42
- text_chunks.extend(sentences)
43
-
44
- if not text_chunks:
45
- return "❗ No text found in PDF."
46
-
47
- embeddings = model.encode(text_chunks, convert_to_tensor=False)
48
- embeddings = np.array(embeddings).astype("float32")
49
- dimension = embeddings.shape[1]
50
- index = faiss.IndexFlatL2(dimension)
51
- index.add(embeddings)
52
-
53
- return "βœ… PDF processed and indexed successfully."
54
- except Exception as e:
55
- return f"❌ PDF processing error: {e}"
56
-
57
- def query_document(question):
58
- global index, text_chunks, model
59
- if index is None or not text_chunks:
60
- return "❗ Please upload and process a PDF first."
61
-
62
- try:
63
- q_embedding = model.encode([question])[0].astype("float32")
64
- D, I = index.search(np.array([q_embedding]), 5)
65
- retrieved_chunks = [text_chunks[i] for i in I[0]]
66
- context = "\n".join(retrieved_chunks)
67
-
68
- system_prompt = "You are a helpful study supervisor. Use the provided context to answer clearly."
69
- user_prompt = f"Context:\n{context}\n\nQuestion:\n{question}"
70
-
71
- return ask_llama3(system_prompt, user_prompt)
72
- except Exception as e:
73
- return f"❌ Query error: {e}"
74
-
75
- # UI
76
- with gr.Blocks() as demo:
77
- gr.Markdown("""
78
- <div style="text-align:center; background:#f97316; color:white; padding: 12px; border-radius: 10px;">
79
- <h2>πŸ“˜ PDF Study Assistant</h2>
80
- <p>Ask questions from your uploaded PDF using Groq + LLaMA3</p>
81
- </div>
82
- """)
83
-
84
- with gr.Column():
85
- api_input = gr.Textbox(label="πŸ”‘ Groq API Key", type="password")
86
- pdf_input = gr.File(label="πŸ“„ Upload PDF", file_types=[".pdf"])
87
- upload_btn = gr.Button("πŸ“₯ Extract & Index PDF", variant="primary")
88
- status_output = gr.Textbox(label="πŸ› οΈ Status", interactive=False)
89
-
90
- question = gr.Textbox(label="❓ Ask a Question", lines=2)
91
- get_answer_btn = gr.Button("πŸ’¬ Get Answer")
92
- answer_output = gr.Textbox(label="πŸ“’ Answer", lines=10, interactive=False)
93
-
94
- upload_btn.click(fn=pdf_to_chunks, inputs=[pdf_input, api_input], outputs=[status_output])
95
- get_answer_btn.click(fn=query_document, inputs=[question], outputs=[answer_output])
96
-
97
- demo.launch()
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from PyPDF2 import PdfReader
4
+ from docx import Document
 
5
  import faiss
6
  import numpy as np
7
+ import torch
8
  from groq import Groq
9
+ from sentence_transformers import SentenceTransformer
10
 
11
+ # βœ… Ensure model uses CPU to avoid meta tensor issue on Hugging Face
12
+ torch.set_default_device("cpu")
13
+
14
+ # βœ… Load Groq API key from environment variable
15
+ client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
16
+
17
+ # βœ… Load SentenceTransformer model safely
18
+ embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", trust_remote_code=True)
19
+
20
+ # βœ… FAISS index and chunk store
21
+ INDEX = faiss.IndexFlatL2(384)
22
+ stored_chunks = []
23
+
24
+ # βœ… UI Styling for Streamlit
25
+ st.markdown("""
26
+ <style>
27
+ .main-title {
28
+ font-size: 40px;
29
+ color: #2E86C1;
30
+ font-weight: bold;
31
+ text-align: center;
32
+ margin-bottom: 30px;
33
+ }
34
+ .card {
35
+ background-color: #ffffff;
36
+ padding: 20px;
37
+ border-radius: 15px;
38
+ box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
39
+ margin-top: 20px;
40
+ }
41
+ body {
42
+ background-color: #f8fbfd;
43
+ }
44
+ </style>
45
+ """, unsafe_allow_html=True)
46
+
47
+ st.markdown('<div class="main-title">πŸ“„ Smart RAG Document QA Assistant</div>', unsafe_allow_html=True)
48
+
49
+ # ✅ Extract text from files
50
def extract_text(file):
    """Return the plain text contained in an uploaded file.

    The file's MIME type (``file.type``) selects the extraction strategy:

    * ``application/pdf`` - text of every page, joined with single spaces;
      pages for which no text can be extracted contribute an empty string.
    * the Word ``.docx`` MIME type - paragraph texts joined with newlines.
    * any ``text*`` type - the raw bytes decoded as UTF-8.

    Args:
        file: File-like object exposing a ``type`` attribute and, for text
            files, a ``read()`` method returning bytes.

    Returns:
        The extracted text, or ``""`` when the MIME type is missing or
        unsupported.
    """
    # A missing MIME type is treated as "unsupported" instead of crashing
    # on ``None.startswith``.
    mime = file.type or ""

    if mime == "application/pdf":
        reader = PdfReader(file)
        return " ".join(page.extract_text() or "" for page in reader.pages)

    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(file)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    if mime.startswith("text"):
        # "utf-8-sig" drops a leading byte-order mark; errors="replace" keeps
        # files in legacy encodings usable instead of raising
        # UnicodeDecodeError.
        return file.read().decode("utf-8-sig", errors="replace")

    return ""
60
+
61
+ # ✅ Split long text into small chunks
62
def chunk_text(text, chunk_size=200, overlap=0):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The text to split. Runs of whitespace are collapsed because the
            text is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of trailing words of one chunk repeated at the start
            of the next. Must satisfy ``0 <= overlap < chunk_size``. The
            default of 0 yields disjoint chunks.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only text yields ``[]``.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the end of the text; with overlap > 0
        # later windows would only repeat words already emitted.
        if start + chunk_size >= len(words):
            break
    return chunks
65
+
66
+ # ✅ Store chunks and their embeddings in FAISS
67
def store_embeddings(chunks):
    """Embed *chunks* and append them to the FAISS index and the chunk store.

    The vectors are added to the module-level ``INDEX`` and the chunk texts
    to ``stored_chunks`` in the same order, so a row id returned by the index
    is also the position of its text in ``stored_chunks``.

    Args:
        chunks: Iterable of text chunks to index.

    Returns:
        None. An empty *chunks* is a no-op.
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to index (e.g. no text could be extracted); skip the
        # encoder and the index so an empty batch is never passed to them.
        return

    vectors = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    INDEX.add(vectors)
    stored_chunks.extend(chunks)
71
+
72
+ # ✅ Retrieve most relevant chunks for a given query
73
def retrieve_similar_chunks(query, top_k=3):
    """Return the stored chunks whose embeddings are nearest to *query*.

    Args:
        query: The question text to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to *top_k* chunk strings in the order returned by the index
        search. The list is empty when *top_k* is below 1 or nothing has
        been indexed yet.
    """
    if top_k < 1 or not stored_chunks or INDEX.ntotal == 0:
        return []

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with id -1, which as a Python list index would silently select
    # the LAST stored chunk.
    k = min(top_k, INDEX.ntotal)
    query_vector = np.asarray(embed_model.encode([query]), dtype=np.float32)
    _distances, indices = INDEX.search(query_vector, k)

    total = len(stored_chunks)
    # Defensive filter in case the index and the chunk store are out of sync.
    return [stored_chunks[int(i)] for i in indices[0] if 0 <= i < total]
77
+
78
+ # ✅ Ask Groq LLM to answer based on document context
79
def get_llm_answer(query, context, model="llama3-70b-8192"):
    """Ask the Groq chat model to answer *query* using *context*.

    Args:
        query: The user's question.
        context: Retrieved document text placed in the prompt ahead of the
            question.
        model: Groq model identifier. Defaults to the id this app has always
            used, so existing callers are unaffected.

    Returns:
        The text of the first completion choice, or ``""`` when the response
        has no choices or the message content is empty.
    """
    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {query}"
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    if not chat_completion.choices:
        return ""
    # Normalise a missing content to "" so callers that interpolate the
    # result into HTML never render the literal string "None".
    return chat_completion.choices[0].message.content or ""
86
+
87
+ # βœ… File upload and query UI
88
+ uploaded_file = st.file_uploader("πŸ“ Upload your document", type=["pdf", "docx", "txt"])
89
+ query = st.text_input("πŸ’¬ Ask a question about your document")
90
+
91
+ if uploaded_file:
92
+ with st.spinner("Processing file..."):
93
+ text = extract_text(uploaded_file)
94
+ chunks = chunk_text(text)
95
+ store_embeddings(chunks)
96
+ st.success("βœ… Document uploaded and indexed!")
97
+
98
+ if st.button("🧠 Get Answer") and query:
99
+ with st.spinner("Thinking..."):
100
+ context = "\n\n".join(retrieve_similar_chunks(query))
101
+ answer = get_llm_answer(query, context)
102
+ st.markdown(f'<div class="card"><b>Answer:</b><br>{answer}</div>', unsafe_allow_html=True)
103
+
104
+ st.markdown("<br><center style='color: grey;'>Built by Muqadas with ❀️ using Streamlit + Groq + FAISS</center>", unsafe_allow_html=True)