Spaces:
Sleeping
Sleeping
| # app.py - Fully Local and Free | |
| import os | |
| import re | |
| import tempfile | |
| import requests | |
| import streamlit as st | |
| from PyPDF2 import PdfReader | |
| from typing import List | |
| import traceback | |
| # Local TTS library (requires system dependencies) | |
| try: | |
| import pyttsx3 | |
| HAS_PYTTSX3 = True | |
| except Exception: | |
| HAS_PYTTSX3 = False | |
# ============ CONFIG ============
def _resolve_huggingface_key():
    """Resolve the HF API key: environment variable first, then Streamlit secrets.

    Accessing ``st.secrets`` raises when no secrets.toml exists at all, so the
    lookup is guarded — the app must stay usable with no key (local fallbacks
    for diagram generation apply).
    """
    key = os.getenv("HUGGINGFACE_API_KEY")
    if key:
        return key
    try:
        return st.secrets.get("HUGGINGFACE_API_KEY")
    except Exception:
        return None

HUGGINGFACE_KEY = _resolve_huggingface_key()
# Model used for summary -> Mermaid generation via the HF inference API.
HF_MERMAID_MODEL = os.getenv("HF_MERMAID_MODEL", "TroyDoesAI/MermaidStable3B")
| # ============ HELPERS ============ | |
def clean_text(text: str) -> str:
    """Collapse every whitespace run in *text* to a single space and trim the ends.

    ``None`` (or any falsy value) is treated as the empty string.
    """
    normalized = re.sub(r"\s+", " ", text or "")
    return normalized.strip()
def extract_text_from_pdf(uploaded_file) -> str:
    """Extract and whitespace-normalize all text from an uploaded PDF.

    Pages that yield no text (e.g. scanned images) are skipped. Each page's
    text is extracted exactly once — the original comprehension called
    ``extract_text()`` twice per page, doubling the parsing work.
    """
    reader = PdfReader(uploaded_file)
    parts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text)
    return clean_text(" ".join(parts))
def local_summary(text: str, num_sentences: int = 6) -> str:
    """Extractive summary: the ``num_sentences`` highest-scoring sentences as bullets.

    Sentences are scored by summing document-wide word frequencies (a small
    stopword list and single-character tokens excluded) and are emitted in
    their original document order. Returns "" for empty input.
    """
    if not text:
        return ""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    words = re.findall(r'\w+', text.lower())
    stopwords = {"the", "and", "is", "in", "to", "of", "a", "that", "it", "for"}
    # Single-pass frequency count; the previous words.count(w)-per-word
    # comprehension was O(n^2) over the document.
    freq = {}
    for w in words:
        if w not in stopwords and len(w) > 1:
            freq[w] = freq.get(w, 0) + 1
    sent_scores = [
        (sum(freq.get(w, 0) for w in re.findall(r'\w+', s.lower())), s)
        for s in sentences
    ]
    # Sort on the score only; the old tuple sort compared sentence text on ties.
    sent_scores.sort(key=lambda item: item[0], reverse=True)
    # Re-order the winners by their position in the document.
    chosen = sorted((s for _, s in sent_scores[:num_sentences]), key=text.find)
    # Whitespace normalization inlined (mirrors clean_text) to keep this
    # function self-contained.
    return "\n".join(
        "- " + re.sub(r"\s+", " ", s).strip() for s in chosen if s.strip()
    )
def pyttsx3_tts_file(text: str):
    """Synthesize *text* to WAV with the local pyttsx3 engine.

    Returns ``(True, wav_bytes)`` on success or ``(False, error_message)``.
    The temporary WAV file is always removed — the original version leaked
    one file per call — and its handle is closed before the engine writes,
    which is required on Windows (open NamedTemporaryFile handles block
    re-opening there).
    """
    if not HAS_PYTTSX3:
        return False, "pyttsx3 not installed"
    tmp_path = None
    try:
        engine = pyttsx3.init()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        tmp_path = tmp.name
        tmp.close()  # release the handle so the TTS engine can write the file
        engine.save_to_file(text, tmp_path)
        engine.runAndWait()
        with open(tmp_path, "rb") as f:
            return True, f.read()
    except Exception as e:
        return False, f"pyttsx3 TTS failed: {e}"
    finally:
        # Best-effort cleanup of the temp file in every path.
        if tmp_path:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
def generate_mermaid_from_summary(summary: str):
    """Turn a bullet-point summary into Mermaid flowchart source.

    Tries the Hugging Face inference API when a key is configured; on any
    failure (network error, no fenced code block in the reply) falls back to
    a local linear chart: one node per summary bullet, capped at 8 nodes.
    """
    prompt = ("Create a concise Mermaid flowchart ('flowchart TD') from the following summary. "
              "Output only the Mermaid code block. Summary:\n" + summary)
    if HUGGINGFACE_KEY:
        url = f"https://api-inference.huggingface.co/models/{HF_MERMAID_MODEL}"
        headers = {"Authorization": f"Bearer {HUGGINGFACE_KEY}"}
        payload = {"inputs": prompt, "parameters": {"max_new_tokens": 512}}
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=40)
            if response.ok and response.json():
                text = response.json()[0]['generated_text']
                match = re.search(r"```(?:mermaid)?\n([\s\S]+?)```", text)
                if match:
                    return match.group(1).strip()
        except Exception:
            pass  # best-effort: fall through to the local chart below
    # Local fallback: one node per summary bullet, linked linearly.
    nodes = [re.sub(r'^- ', '', line).strip() for line in summary.split('\n') if line.strip()]
    if not nodes:
        return "graph TD\n A[Summary Empty]"
    shown = nodes[:8]
    mermaid_code = "graph TD\n"
    for i, node_text in enumerate(shown):
        # Hoisted out of the f-string: re-using the delimiting quote inside an
        # f-string expression is a SyntaxError before Python 3.12 (PEP 701).
        label = node_text.replace('"', "'")[:60]
        mermaid_code += f' A{i}["{label}"]\n'
    for i in range(len(shown) - 1):
        mermaid_code += f" A{i} --> A{i+1}\n"
    return mermaid_code
def render_mermaid(mermaid_code: str):
    """Render Mermaid source in the Streamlit page via the Mermaid CDN bundle.

    The diagram is wrapped in ``.mermaid-container`` (the original defined
    that style but attached it to no element) and ``mermaid.initialize`` is
    called explicitly so rendering does not rely on the bundle's implicit
    autostart behavior.
    """
    html_code = f"""
    <div class="mermaid-container">
      <div class="mermaid">
    {mermaid_code}
      </div>
    </div>
    <script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
    <script>mermaid.initialize({{ startOnLoad: true }});</script>
    <style>
    .mermaid-container {{ height: 420px; border: 1px solid #ddd; padding: 10px; border-radius: 8px; }}
    </style>
    """
    st.components.v1.html(html_code, height=450, scrolling=True)
def local_qa(text: str, query: str) -> str:
    """Keyword-overlap Q&A over the document text.

    Splits *text* into sentences, scores each by how many distinct query
    words it shares, and returns up to the three best matches joined into
    one answer. Falls back to a fixed message when nothing overlaps.
    """
    query_tokens = set(re.findall(r'\w+', query.lower()))
    scored = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        overlap = query_tokens & set(re.findall(r'\w+', sentence.lower()))
        if overlap:
            scored.append((len(overlap), sentence))
    if not scored:
        return "I couldn't find a relevant answer in the document."
    # Stable sort: equal-score sentences keep document order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return " ".join(sentence for _, sentence in scored[:3])
# ============ STREAMLIT UI ============
st.set_page_config(page_title="PDF Assistant", layout="wide")
st.title("π PDF Assistant: Summary, Diagram, Q&A")
st.markdown("---")

# Seed session state so later code can read these keys unconditionally.
for _key, _default in (
    ("raw_text", None),
    ("summary", None),
    ("mermaid_code", None),
    ("chat_history", []),
):
    st.session_state.setdefault(_key, _default)

with st.sidebar:
    st.header("π API Status")
    hf_status = 'β Key present' if HUGGINGFACE_KEY else 'β Key missing. Diagram will be local.'
    tts_status = 'β Active' if HAS_PYTTSX3 else 'β Not available. Run `pip install pyttsx3`'
    st.markdown(f"**Hugging Face:** {hf_status}")
    st.markdown(f"**Local TTS:** {tts_status}")
uploaded_file = st.file_uploader("1. Upload a PDF", type=["pdf"])

# Only extract once per session: raw_text stays populated across reruns.
if uploaded_file and st.session_state.raw_text is None:
    with st.spinner("Extracting text..."):
        extracted = extract_text_from_pdf(uploaded_file)
        st.session_state.raw_text = extracted
    if extracted:
        st.success("Text extracted successfully!")
    else:
        st.warning("No text extracted from PDF. Is it a scanned image?")
# Main interaction area: only shown once text has been extracted.
if st.session_state.raw_text:
    st.markdown("---")
    if st.button("2. Generate Summary & Diagram"):
        with st.spinner("Generating summary and diagram..."):
            st.session_state.summary = local_summary(st.session_state.raw_text)
            st.session_state.mermaid_code = generate_mermaid_from_summary(st.session_state.summary)
        st.success("Summary and diagram generated!")

    if st.session_state.summary:
        st.header("π Summary")
        st.markdown(st.session_state.summary)

        st.header("πΊοΈ Diagram")
        render_mermaid(st.session_state.mermaid_code)
        st.code(st.session_state.mermaid_code, language="mermaid")

        st.header("π Audio")
        if st.button("Generate Audio"):
            with st.spinner("Generating audio..."):
                ok, out = pyttsx3_tts_file(st.session_state.summary)
            if ok:
                st.audio(out, format="audio/wav")
                # Plain string: the original used an f-string with no placeholders.
                st.info("Audio generated using: **pyttsx3**")
            else:
                # Surface the reason — `out` carries the error text on failure;
                # the original discarded it and showed a generic message.
                st.error(f"Audio generation failed: {out}")

    st.markdown("---")
    st.header("π¬ Q&A Chatbot")
    # Replay the conversation so far (chat_history holds (role, content) pairs).
    for role, content in st.session_state.chat_history:
        with st.chat_message(role):
            st.markdown(content)

    prompt = st.chat_input("Ask a question about the PDF")
    if prompt:
        st.session_state.chat_history.append(("user", prompt))
        with st.chat_message("user"):
            st.markdown(prompt)
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                answer = local_qa(st.session_state.raw_text, prompt)
            st.markdown(answer)
        st.session_state.chat_history.append(("assistant", answer))