Spaces:
Sleeping
Sleeping
File size: 15,330 Bytes
2bf8221 249f024 2bf8221 aeb29d9 2bf8221 f4bbb80 2bf8221 4a88a29 2bf8221 372ddca 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 48b7c01 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 48b7c01 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 f4bbb80 2bf8221 26938a8 2bf8221 f4bbb80 2bf8221 49daf6f 2bf8221 933d0b2 372ddca 2bf8221 933d0b2 2bf8221 dc75fd3 2bf8221 933d0b2 2bf8221 933d0b2 2bf8221 933d0b2 2bf8221 372ddca 933d0b2 372ddca 933d0b2 2bf8221 372ddca 933d0b2 736fff1 2bf8221 55574e7 2bf8221 372ddca 2bf8221 7137d82 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 |
import os
import fitz # PyMuPDF
import streamlit as st
import tempfile
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import tiktoken
import requests
from deep_translator import GoogleTranslator
from gtts import gTTS
import time
# Page-level configuration; this is the first Streamlit call the script makes.
st.set_page_config(
    page_title="RAG Document Assistant",
    # The icon literal had been mis-encoded into a non-emoji character;
    # use a real emoji so the browser tab shows an icon.
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
def sidebar_profiles():
    """Render the author credit and contact links in the sidebar.

    The links are written as a fixed raw-HTML literal (hence
    ``unsafe_allow_html=True``); no user-supplied text is interpolated.
    The heading emoji replace characters that had been mis-encoded.
    """
    st.sidebar.markdown("""<hr>""", unsafe_allow_html=True)
    st.sidebar.markdown("### 👤 Author: Maria Nadeem")
    st.sidebar.markdown("### 🔗 Connect With Me")
    st.sidebar.markdown("""
<hr>
<div class="profile-links">
<a href="https://github.com/marianadeem755" target="_blank">
<img src="https://cdn-icons-png.flaticon.com/512/25/25231.png" width="20px"> GitHub
</a><br><br>
<a href="https://www.kaggle.com/marianadeem755" target="_blank">
<img src="https://cdn4.iconfinder.com/data/icons/logos-and-brands/512/189_Kaggle_logo_logos-512.png" width="20px"> Kaggle
</a><br><br>
<a href="mailto:marianadeem755@gmail.com">
<img src="https://cdn-icons-png.flaticon.com/512/561/561127.png" width="20px"> Email
</a><br><br>
<a href="https://huggingface.co/maria355" target="_blank">
<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" width="20px"> Hugging Face
</a>
</div>
<hr>
""", unsafe_allow_html=True)


# Add the profile section
sidebar_profiles()
def get_api_key():
    """Return the Groq API key from the environment, or ``None`` if unusable.

    The key is read from the ``GROQ_API_KEY`` environment variable and
    stripped of surrounding whitespace. A missing, empty, or whitespace-only
    value counts as "not set": an error is shown in the Streamlit UI and
    ``None`` is returned, so callers can keep testing the result for
    truthiness.

    Returns:
        str | None: The stripped key, or ``None`` when no usable key exists.
    """
    api_key = (os.getenv("GROQ_API_KEY") or "").strip()
    if not api_key:
        st.error("GROQ_API_KEY environment variable is not set. Please set it before running the application.")
        return None
    return api_key
# Seed the per-session values on first use; anything already stored in the
# session is left untouched.
_SESSION_DEFAULTS = {
    "chunks": [],
    "chunk_sources": [],
    "debug_mode": False,
    "last_query_time": None,
    "last_response": None,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key in st.session_state:
        continue
    st.session_state[_state_key] = _initial_value
@st.cache_resource
def load_embedder():
    """Build the sentence-embedding model used for chunks and queries.

    The function is wrapped in ``st.cache_resource``, so the model is
    obtained through Streamlit's resource cache.
    """
    model_name = "all-MiniLM-L6-v2"
    return SentenceTransformer(model_name)
# Shared embedding model, obtained from the cached loader.
embedder = load_embedder()
# Width of the vectors stored in the FAISS index; presumably the output size
# of the embedding model — TODO confirm if the model is ever changed.
embedding_dim = 384
# Flat L2 index, created empty each time this module-level code executes and
# rebuilt by index_uploaded_text().
index = faiss.IndexFlatL2(embedding_dim)
# Tokenizer used by num_tokens_from_string() to count tokens.
tokenizer = tiktoken.get_encoding("cl100k_base")
def num_tokens_from_string(string: str) -> int:
    """Return how many tokens ``string`` encodes to with the module tokenizer."""
    token_ids = tokenizer.encode(string)
    return len(token_ids)
def _split_oversized_sentence(sentence, max_tokens, count_tokens):
    """Break a sentence that exceeds ``max_tokens`` into word-aligned pieces.

    Sentences within the budget are returned unchanged as a one-item list.
    Piece sizes are estimated by summing per-word token counts, so they are
    approximate; a single word larger than the budget is kept whole.
    Whitespace inside a split sentence is normalised to single spaces.
    """
    if count_tokens(sentence) <= max_tokens:
        return [sentence]
    pieces = []
    words = []
    used = 0
    for word in sentence.split():
        cost = count_tokens(word)
        if words and used + cost > max_tokens:
            pieces.append(" ".join(words))
            words = []
            used = 0
        words.append(word)
        used += cost
    if words:
        pieces.append(" ".join(words))
    return pieces


def chunk_text(text, max_tokens=250, count_tokens=None):
    """Split ``text`` into chunks of roughly at most ``max_tokens`` tokens.

    The text is split on ``". "`` and sentences are packed greedily into
    chunks. Each chunk is the sentences re-joined with ``". "`` and is
    guaranteed to end with a period. A sentence that alone exceeds the
    budget (for example text that contains no ``". "`` at all) is broken on
    word boundaries instead of being emitted as one oversized chunk.

    Args:
        text: Source text; ``None`` or an empty string yields ``[]``.
        max_tokens: Token budget per chunk. Separator tokens are not counted.
        count_tokens: Optional callable ``str -> int`` used to measure text.
            Defaults to ``num_tokens_from_string``.

    Returns:
        list[str]: The chunks, in document order.
    """
    if not text:
        return []
    if count_tokens is None:
        count_tokens = num_tokens_from_string

    def close_chunk(parts):
        joined = ". ".join(parts)
        return joined if joined.endswith(".") else joined + "."

    result_chunks = []
    current_chunk = []
    total_tokens = 0
    for raw_sentence in text.split(". "):
        if not raw_sentence.strip():
            continue
        for sentence in _split_oversized_sentence(raw_sentence, max_tokens, count_tokens):
            token_len = count_tokens(sentence)
            # Start a new chunk when this sentence would overflow the budget.
            if current_chunk and total_tokens + token_len > max_tokens:
                result_chunks.append(close_chunk(current_chunk))
                current_chunk = []
                total_tokens = 0
            current_chunk.append(sentence)
            total_tokens += token_len
    if current_chunk:
        result_chunks.append(close_chunk(current_chunk))
    return result_chunks
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Binary file-like object; its remaining bytes are consumed
            with ``read()`` and parsed as a PDF.

    Returns:
        str: The page texts joined with no separator.
    """
    # Use the document as a context manager so it is closed after extraction.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def index_uploaded_text(text):
    """Chunk ``text``, embed the chunks and rebuild the module-level index.

    The global ``index`` is replaced with a fresh, empty ``IndexFlatL2``,
    and ``st.session_state.chunks`` / ``chunk_sources`` are overwritten so
    that row ``i`` of the index corresponds to ``chunks[i]``.

    Args:
        text: Full document text to index.

    Returns:
        int: Number of chunks created (0 when the text yields none).
    """
    global index
    index = faiss.IndexFlatL2(embedding_dim)
    chunks_list = chunk_text(text)
    st.session_state.chunks = chunks_list
    st.session_state.chunk_sources = [
        f"Chunk {i+1}: {chunk[:50]}..." for i, chunk in enumerate(chunks_list)
    ]
    if chunks_list:
        # Encode every chunk in a single call instead of one call per chunk.
        vectors = embedder.encode(chunks_list)
        index.add(np.ascontiguousarray(vectors, dtype="float32"))
    return len(chunks_list)
def retrieve_chunks(query, top_k=5):
    """Return the stored chunks nearest to ``query`` in the vector index.

    Args:
        query: Question text to embed and search with.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks in the order the index returns
        them; empty when the index is empty or ``top_k`` is not positive.
    """
    if index.ntotal == 0 or top_k <= 0:
        return []
    q_vector = embedder.encode([query])
    _, neighbor_ids = index.search(
        np.asarray(q_vector, dtype="float32"), k=min(top_k, index.ntotal)
    )
    chunks = st.session_state.chunks
    # Require 0 <= i as well: a negative label would otherwise index the
    # chunk list from its end and return an unrelated chunk.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]
def build_prompt(system_prompt, context_chunks, question):
    """Assemble the prompt text sent to the language model.

    Args:
        system_prompt: Instruction text placed at the top of the prompt.
        context_chunks: Iterable of context strings; they are joined with a
            blank line between them.
        question: The user's question.

    Returns:
        str: Newline-separated sections: instructions, context, question
        and a closing answer instruction.
    """
    joined_context = "\n\n".join(context_chunks)
    sections = [
        f"{system_prompt}",
        "Context:",
        f"{joined_context}",
        "Question:",
        f"{question}",
        "Answer: Please provide a comprehensive answer based only on the context provided.",
    ]
    return "\n".join(sections)
_GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions"
_GROQ_FALLBACK_MODEL = "llama-3.1-8b-instant"
_GROQ_TIMEOUT_SECONDS = 30


def generate_answer(prompt):
    """Send ``prompt`` to the Groq chat-completions API and return the answer.

    The model is taken from ``st.session_state["MODEL_CHOICE"]`` and defaults
    to the fallback model. If the API answers 400 with a "model not found"
    message and a different model was selected, the request is retried once
    with the fallback model. Both requests use the same timeout.

    Side effects: stores the duration of the first request in
    ``st.session_state.last_query_time`` and the answer in
    ``st.session_state.last_response``.

    Args:
        prompt: Full user prompt, normally produced by ``build_prompt``.

    Returns:
        str: The model's answer, or a human-readable error message. Failures
        are reported through the return value and are not raised.
    """
    api_key = get_api_key()
    if not api_key:
        # The key is only read from the environment; there is no sidebar input.
        return "API key is missing. Please set the GROQ_API_KEY environment variable."
    headers = {
        "Authorization": f"Bearer {api_key.strip()}",
        "Content-Type": "application/json",
    }
    selected_model = st.session_state.get("MODEL_CHOICE", _GROQ_FALLBACK_MODEL)
    payload = {
        "model": selected_model,
        "messages": [
            {"role": "system", "content": "You are a helpful document assistant that answers questions only using the provided context."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 1024,
    }
    try:
        start_time = time.time()
        with st.spinner("Sending request to Groq API..."):
            response = requests.post(
                _GROQ_CHAT_URL,
                json=payload,
                headers=headers,
                timeout=_GROQ_TIMEOUT_SECONDS,
            )
        query_time = time.time() - start_time
        st.session_state.last_query_time = f"{query_time:.2f} seconds"
        if response.status_code == 401:
            return "Authentication failed: Invalid or expired API key."
        if response.status_code == 400:
            error_info = response.json().get("error", {})
            error_message = error_info.get("message", "Unknown error")
            # Retrying only helps when a different model can be tried.
            can_retry = (
                "model not found" in error_message.lower()
                and payload["model"] != _GROQ_FALLBACK_MODEL
            )
            if not can_retry:
                return f"API Error: {error_message}"
            st.warning("Trying with alternate model...")
            payload["model"] = _GROQ_FALLBACK_MODEL
            response = requests.post(
                _GROQ_CHAT_URL,
                json=payload,
                headers=headers,
                timeout=_GROQ_TIMEOUT_SECONDS,
            )
            if response.status_code != 200:
                return f"Both model attempts failed. Error: {error_message}"
        response.raise_for_status()
        response_json = response.json()
        if "choices" not in response_json or not response_json["choices"]:
            return "No answer was generated."
        answer = response_json["choices"][0]["message"]["content"]
        st.session_state.last_response = answer
        return answer
    except requests.exceptions.RequestException as e:
        return f"API request failed: {str(e)}"
    except Exception as e:
        # UI boundary: report anything unexpected instead of crashing the app.
        return f"Unexpected error: {str(e)}"
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` with GoogleTranslator.

    The source language is auto-detected. On any failure an error is shown
    in the UI and the untranslated ``text`` is returned.
    """
    try:
        with st.spinner(f"Translating to {target_language}..."):
            translator = GoogleTranslator(source='auto', target=target_language)
            translated = translator.translate(text)
    except Exception as exc:
        st.error(f"Translation failed: {str(exc)}")
        return text
    return translated
def text_to_speech(text, lang_code):
    """Synthesize ``text`` to an MP3 file and return the file's path.

    Args:
        text: Text to speak.
        lang_code: Language code passed to gTTS.

    Returns:
        str | None: Path of the generated ``.mp3`` temporary file, or
        ``None`` if synthesis failed (an error is shown in the UI). The
        caller owns the returned file and should delete it when done.
    """
    try:
        with st.spinner("Generating audio..."):
            tts = gTTS(text=text, lang=lang_code)
            # Reserve a path, then close our handle before gTTS writes to it
            # so the descriptor is not left open.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
                audio_path = temp_file.name
            try:
                tts.save(audio_path)
            except Exception:
                # Do not leave an empty or partial file behind on failure.
                os.remove(audio_path)
                raise
            return audio_path
    except Exception as e:
        st.error(f"Text-to-speech failed: {str(e)}")
        return None
# Streamlit UI: page title and one-line introduction.
# The title's leading emoji replaces a character that had been mis-encoded.
st.title("📚 RAG Explorer: AI-Powered Document Assistant & Translator")
st.markdown("Upload a document and ask questions to get AI-powered answers with translation capabilities.")
# Sidebar controls: model selection, debug toggle and a short "About" text.
# (The API key is read from the environment; there is no key input here.)
with st.sidebar:
    st.subheader("Model Selection")
    model_choice = st.selectbox(
        "Select LLM Model",
        [
            "llama-3.1-8b-instant",  # first entry, so it is the default selection
            "llama-3.3-70b-versatile",
        ],
        help="Choose the Groq model to use for answering questions",
    )
    # generate_answer() reads the choice back from session state.
    st.session_state["MODEL_CHOICE"] = model_choice

    # Debug mode toggle
    st.subheader("Debug Settings")
    st.session_state.debug_mode = st.checkbox("Show Debug Information", value=st.session_state.debug_mode)
    # Show the stored timing of the most recent API call, if there is one.
    if st.session_state.last_query_time:
        st.caption(f"Last query time: {st.session_state.last_query_time}")

    st.subheader("About")
    st.markdown("""
This app uses Retrieval-Augmented Generation (RAG) to answer questions about uploaded documents.
1. Upload a document
2. Ask a question
3. Translate responses to other languages
""")
# Main content area: document upload, indexing and a short preview.
col1, col2 = st.columns([2, 1])
with col1:
    uploaded_file = st.file_uploader("Upload a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file:
        with st.spinner("Reading and indexing document..."):
            # Rewind first so the whole file is read even if the same
            # buffer was already consumed earlier.
            uploaded_file.seek(0)
            file_name = (uploaded_file.name or "").lower()
            # Fall back on the extension when the MIME type is not the
            # expected one; the uploader only accepts pdf and txt.
            if uploaded_file.type == "application/pdf" or file_name.endswith(".pdf"):
                raw_text = extract_text_from_pdf(uploaded_file)
            else:
                # Replace undecodable bytes instead of failing on non-UTF-8 text.
                raw_text = uploaded_file.read().decode("utf-8", errors="replace")
            total_chunks = index_uploaded_text(raw_text)
        if total_chunks:
            st.success(f"Document indexed successfully! Created {total_chunks} chunks.")
        else:
            st.warning("No text could be extracted from this document.")

        # Display document preview
        with st.expander("Document Preview"):
            st.subheader("Key Points")
            # Simple heuristic: among the first 50 sentences, keep those of
            # moderate length that contain a signal word or end with a colon.
            sentences = raw_text.split('. ')
            important_keywords = (
                "important", "key", "significant", "main", "primary", "essential",
                "critical", "crucial", "fundamental", "major", "summary", "conclusion",
            )
            key_points = []
            for sentence in sentences[:50]:
                sentence = sentence.strip()
                if not 15 < len(sentence) < 200:
                    continue
                lowered = sentence.lower()
                if sentence.endswith(':') or any(keyword in lowered for keyword in important_keywords):
                    key_points.append(sentence)
            # Too few hits: fall back to every 10th of the first 50 sentences.
            if len(key_points) < 3:
                key_points = [s.strip() for s in sentences[:50:10] if len(s.strip()) > 15][:5]
            # Show up to 5 key points as bullets.
            for point in key_points[:5]:
                st.markdown(f"• {point}")
            if not key_points:
                st.info("No clear key points detected. Try exploring the full document.")
with col2:
    if st.session_state.chunks:
        st.info(f"Document chunks: {len(st.session_state.chunks)}")
# Query and answer section
st.divider()
query = st.text_input("Ask a question about the document")
col1, col2 = st.columns([1, 1])
with col1:
    enable_translation = st.checkbox("Translate answer", value=False)
    use_local = st.checkbox("Use local processing (no API call)", value=False,
                            help="Use this if you're having API issues")
with col2:
    language = st.selectbox("Language", ["English", "Urdu", "Hindi", "French", "Chinese", "Spanish", "German", "Arabic", "Russian"])
language_codes = {
    "English": "en", "Urdu": "ur", "Hindi": "hi", "French": "fr", "Chinese": "zh-CN",
    "Spanish": "es", "German": "de", "Arabic": "ar", "Russian": "ru"
}
lang_code = language_codes[language]

submit_button = st.button("Get Answer", type="primary", key="submit_query")

if submit_button and not query.strip():
    # Give feedback instead of silently ignoring the click.
    st.warning("Please enter a question first.")
elif submit_button:
    if index.ntotal == 0:
        st.warning("Please upload and index a document first.")
    else:
        answer = None
        top_chunks = []
        with st.spinner("Generating answer..."):
            top_chunks = retrieve_chunks(query)
            if not top_chunks:
                st.error("No relevant content found.")
            elif use_local:
                # Local mode never touches the API, so the key is not checked
                # (and no missing-key error is shown) on this path.
                st.warning("Using local processing - limited functionality!")
                passages = "".join(
                    f"{i}. {chunk[:200]}...\n\n"
                    for i, chunk in enumerate(top_chunks[:3], 1)
                )
                answer = (
                    "Local processing summary (no LLM used):\n\n"
                    f"Question: {query}\n\n"
                    "Here are the most relevant passages found:\n\n"
                    + passages
                )
            elif not get_api_key():
                # The key comes from the environment; there is no sidebar input.
                st.error("API key is not set. Set the GROQ_API_KEY environment variable, or enable local processing.")
            else:
                system_prompt = "You are a document assistant. Use only the context to answer accurately."
                prompt = build_prompt(system_prompt, top_chunks, query)
                answer = generate_answer(prompt)

        if answer is not None:
            # Display query and context if debug mode is on
            if st.session_state.debug_mode:
                with st.expander("Query Context", expanded=False):
                    st.write("Query:", query)
                    st.write("Top chunks used:")
                    for i, chunk in enumerate(top_chunks, 1):
                        st.write(f"{i}. {chunk[:100]}...")

            # Tabs for the original and the translated answer
            tab1, tab2 = st.tabs(["Original Answer", f"Translated ({language})" if enable_translation else "Translation (disabled)"])
            with tab1:
                st.markdown("### Answer:")
                st.write(answer)
            with tab2:
                if enable_translation and answer:
                    translated = translate_text(answer, lang_code)
                    st.markdown(f"### Answer ({language}):")
                    st.write(translated)
                    # Audio generation: hand the bytes to the player and
                    # delete the temporary file so files do not accumulate.
                    audio_path = text_to_speech(translated, lang_code)
                    if audio_path:
                        try:
                            with open(audio_path, "rb") as audio_file:
                                audio_bytes = audio_file.read()
                        finally:
                            os.remove(audio_path)
                        st.audio(audio_bytes, format="audio/mp3")
                else:
                    st.info("Enable translation to see the answer in your selected language.")
# Footer: divider plus a one-line credit.
st.divider()
st.caption("RAG Document Assistant - Powered by Groq & Sentence Transformers")