# HugMeBytes's picture
# Update app.py
# 7384705 verified
import gradio as gr
import fitz # PyMuPDF
import os
import tempfile
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
# === CONFIG CHECK ===
# Warn (rather than crash) at import time when the API key is absent;
# call_groq_api repeats this check per request and returns an error string.
if not os.getenv("GROQ_API_KEY"):
    print("WARNING: GROQ_API_KEY environment variable not set. API calls will fail.")
# === Globals ===
# Module-wide TF-IDF vectorizer (English stop words removed); it is
# re-fitted on every retrieve_context call, so state does not persist.
vectorizer = TfidfVectorizer(stop_words='english')
# === UTILITY FUNCTIONS ===
# NOTE(review): an obsolete commented-out draft of call_groq_api (a bare
# triple-quoted string targeting the retired "llama-3.1-70b-versatile"
# model) was removed here; the live implementation follows below.
def call_groq_api(prompt):
    """Send *prompt* to the Groq chat-completions API and return the reply text.

    Returns a human-readable error string instead of raising when the API key
    is missing, the HTTP request fails, or the response body is malformed —
    the Gradio UI displays whatever string comes back.
    """
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "Error: GROQ_API_KEY environment variable not set."
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": "llama-3.3-70b-versatile",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
    }
    try:
        # Explicit timeout: requests.post without one can hang forever and
        # freeze the UI if the API stalls.
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            json=data,
            headers=headers,
            timeout=60,
        )
        if response.status_code != 200:
            return f"API Error {response.status_code}: {response.text}"
        result = response.json()
        return result["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        return f"Network Error: {e}"
    except (KeyError, IndexError, ValueError) as e:
        # KeyError/IndexError: unexpected response shape; ValueError: bad JSON.
        return f"Unexpected Error: {e}"
def extract_text_from_pdfs(pdf_files):
    """Extract per-page text from the uploaded PDF files.

    Returns three parallel lists: non-empty page texts (``chunks``), their
    1-based page numbers, and the source file base names. Files that fail to
    open or parse are skipped with a console warning rather than aborting.
    """
    chunks, pages, file_names = [], [], []
    for file in pdf_files:
        try:
            # Context manager guarantees the document handle is closed even
            # if a page raises mid-iteration (the original leaked it).
            with fitz.open(file.name) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        chunks.append(text)
                        pages.append(page_num)
                        file_names.append(os.path.basename(file.name))
        except Exception as e:
            # Best-effort: report and continue with the remaining files.
            print(f"Error processing {file.name}: {e}")
    return chunks, pages, file_names
def retrieve_context(query, chunks, pages, file_names, top_k=3):
    """Select the ``top_k`` chunks most similar to *query* by TF-IDF cosine similarity.

    Returns ``(context_text, selected_chunks, references)``. When there are no
    chunks, or nothing is similar enough (max cosine < 0.2), returns the
    sentinel message "Ask a relevant question." with empty lists — the caller
    matches on that exact string.
    """
    if not chunks:
        # Guard: fit_transform on a query-only corpus followed by a
        # similarity check against an empty matrix would crash.
        return "Ask a relevant question.", [], []
    all_texts = chunks + [query]
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    query_vec = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vec, tfidf_matrix[:-1]).flatten()
    # 0.2 is an empirical relevance floor; below it the best match is noise.
    if similarities.max() < 0.2:
        return "Ask a relevant question.", [], []
    top_indices = similarities.argsort()[-top_k:][::-1]
    selected_chunks = [chunks[i] for i in top_indices]
    references = [f"{file_names[i]} (p.{pages[i]})" for i in top_indices]
    return "\n".join(selected_chunks), selected_chunks, references
def download_chat(chat_history):
    """Write the Q/A history to a timestamped temp .txt file and return its path.

    Returns ``None`` when the history is empty, which the UI treats as
    "nothing to download".
    """
    if not chat_history:
        return None
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    path = os.path.join(tempfile.gettempdir(), f"chat_{stamp}.txt")
    # Build the whole transcript first, then write it in a single call.
    transcript = "".join(
        f"Q: {question}\nA: {answer}\n\n" for question, answer in chat_history
    )
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(transcript)
    return path
# === Main Q&A Logic ===
def answer_question(text_input, pdf_files, chat_history):
    """Validate inputs, retrieve PDF context, and query the LLM.

    Returns ``(answer_text, chat_history, chat_history)`` — the history is
    returned twice because it feeds both the chatbox display and gr.State.
    """
    chat_history = [] if chat_history is None else chat_history
    # Guard clauses: bail out early on missing user input.
    if not text_input:
        return "❗ Please type a question.", chat_history, chat_history
    if not pdf_files:
        return "❗ Please upload PDF files first.", chat_history, chat_history
    chunks, pages, file_names = extract_text_from_pdfs(pdf_files)
    if not chunks:
        return "❗ Could not extract text from PDFs.", chat_history, chat_history
    context, matched_chunks, references = retrieve_context(
        text_input, chunks, pages, file_names
    )
    if context == "Ask a relevant question.":
        # Query matched nothing in the PDFs — record the exchange and stop.
        off_topic = "⚠️ Ask a relevant question based on the PDFs."
        chat_history.append([text_input, off_topic])
        return off_topic, chat_history, chat_history
    prompt = (
        f"Answer the question using this context:\n\n{context}\n\n"
        f"Question: {text_input}\n\nAnswer:"
    )
    reply = call_groq_api(prompt)
    full_answer = f"{reply}\n\nπŸ“Œ Sources: {', '.join(references)}"
    chat_history.append([text_input, full_answer])
    return full_answer, chat_history, chat_history
# === Custom CSS ===
custom_css = """
.gradio-container {
max-width: 900px !important;
margin: auto;
font-family: 'Segoe UI', sans-serif;
}
body {
background-color: var(--background-primary);
color: var(--body-text-color);
}
textarea, input, button {
font-family: 'Segoe UI', sans-serif !important;
}
"""
# === Launch UI ===
# Three-tab layout: upload PDFs, ask questions, export the chat transcript.
with gr.Blocks(css=custom_css, theme=gr.themes.Base()) as demo:
    gr.Markdown("""
    # 🧠 **SmartPDF Q&A Bot**
    _Ask questions from your PDFs. Get answers with page references. Download chat history._
    """, elem_id="title")
    # Server-side chat history, persisted across button clicks.
    chat_state = gr.State([])
    with gr.Tabs():
        with gr.Tab("πŸ“‚ Upload PDFs"):
            gr.Markdown("### Step 1: Upload one or more PDF documents.")
            pdf_input = gr.File(label="πŸ“ Upload PDF Files", file_types=[".pdf"], file_count="multiple")
        with gr.Tab("πŸ’¬ Ask Questions"):
            gr.Markdown("### Step 2: Ask a question about the uploaded documents.")
            with gr.Row():
                text_input = gr.Textbox(label="❓ Type your question here", placeholder="e.g. What is the main idea of the first document?", lines=2)
                ask_btn = gr.Button("πŸ” Ask")
            answer_output = gr.Textbox(label="🧠 Answer", lines=6)
            chatbox = gr.Dataframe(headers=["User", "Bot"], label="πŸ’¬ Chat History", interactive=False)
        with gr.Tab("πŸ“₯ Export Chat History"):
            gr.Markdown("### Step 3: Download your chat session.")
            download_btn = gr.Button("⬇️ Download Chat History")
            # Hidden until a transcript file exists (revealed by .then below).
            download_file = gr.File(label="πŸ“„ Your Chat File", visible=False)
    # === Button Event Binding ===
    # answer_question returns the answer plus the history twice: once for the
    # visible chatbox, once to update the gr.State copy.
    ask_btn.click(
        answer_question,
        inputs=[text_input, pdf_input, chat_state],
        outputs=[answer_output, chatbox, chat_state]
    )
    # Generate the file, then flip the download component visible.
    download_btn.click(
        download_chat,
        inputs=[chat_state],
        outputs=download_file
    ).then(lambda: gr.update(visible=True), None, [download_file])

if __name__ == "__main__":
    demo.launch(share=True)