Spaces:
Sleeping
Sleeping
| # app.py - Fully Local and Free | |
| import os | |
| import re | |
| import tempfile | |
| import requests | |
| import streamlit as st | |
| from PyPDF2 import PdfReader | |
| from typing import List | |
| import traceback | |
| # Local TTS library (requires system dependencies) | |
| try: | |
| import pyttsx3 | |
| HAS_PYTTSX3 = True | |
| except Exception: | |
| HAS_PYTTSX3 = False | |
# ============ CONFIG ============
def _resolve_huggingface_key():
    """Resolve the HF API key: environment variable first, then Streamlit secrets.

    Accessing ``st.secrets`` raises when no secrets.toml exists at all, so the
    lookup is guarded — the app must stay usable with no key (local fallbacks
    for diagram generation apply).
    """
    key = os.getenv("HUGGINGFACE_API_KEY")
    if key:
        return key
    try:
        return st.secrets.get("HUGGINGFACE_API_KEY")
    except Exception:
        return None

HUGGINGFACE_KEY = _resolve_huggingface_key()
# Model used for summary -> Mermaid generation via the HF inference API.
HF_MERMAID_MODEL = os.getenv("HF_MERMAID_MODEL", "TroyDoesAI/MermaidStable3B")
| # ============ HELPERS ============ | |
def clean_text(text: str) -> str:
    """Collapse every whitespace run in *text* to a single space and trim the ends.

    ``None`` (or any falsy value) is treated as the empty string.
    """
    normalized = re.sub(r"\s+", " ", text or "")
    return normalized.strip()
def extract_text_from_pdf(uploaded_file) -> str:
    """Extract and whitespace-normalize all text from an uploaded PDF.

    Pages that yield no text (e.g. scanned images) are skipped. Each page's
    text is extracted exactly once — the original comprehension called
    ``extract_text()`` twice per page, doubling the parsing work.
    """
    reader = PdfReader(uploaded_file)
    parts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text)
    return clean_text(" ".join(parts))
def local_summary(text: str, num_sentences: int = 6) -> str:
    """Extractive summary: the ``num_sentences`` highest-scoring sentences as bullets.

    Sentences are scored by summing document-wide word frequencies (a small
    stopword list and single-character tokens excluded) and are emitted in
    their original document order. Returns "" for empty input.
    """
    if not text:
        return ""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    words = re.findall(r'\w+', text.lower())
    stopwords = {"the", "and", "is", "in", "to", "of", "a", "that", "it", "for"}
    # Single-pass frequency count; the previous words.count(w)-per-word
    # comprehension was O(n^2) over the document.
    freq = {}
    for w in words:
        if w not in stopwords and len(w) > 1:
            freq[w] = freq.get(w, 0) + 1
    sent_scores = [
        (sum(freq.get(w, 0) for w in re.findall(r'\w+', s.lower())), s)
        for s in sentences
    ]
    # Sort on the score only; the old tuple sort compared sentence text on ties.
    sent_scores.sort(key=lambda item: item[0], reverse=True)
    # Re-order the winners by their position in the document.
    chosen = sorted((s for _, s in sent_scores[:num_sentences]), key=text.find)
    # Whitespace normalization inlined (mirrors clean_text) to keep this
    # function self-contained.
    return "\n".join(
        "- " + re.sub(r"\s+", " ", s).strip() for s in chosen if s.strip()
    )
def pyttsx3_tts_file(text: str):
    """Synthesize *text* to WAV with the local pyttsx3 engine.

    Returns ``(True, wav_bytes)`` on success or ``(False, error_message)``.
    The temporary WAV file is always removed — the original version leaked
    one file per call — and its handle is closed before the engine writes,
    which is required on Windows (open NamedTemporaryFile handles block
    re-opening there).
    """
    if not HAS_PYTTSX3:
        return False, "pyttsx3 not installed"
    tmp_path = None
    try:
        engine = pyttsx3.init()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        tmp_path = tmp.name
        tmp.close()  # release the handle so the TTS engine can write the file
        engine.save_to_file(text, tmp_path)
        engine.runAndWait()
        with open(tmp_path, "rb") as f:
            return True, f.read()
    except Exception as e:
        return False, f"pyttsx3 TTS failed: {e}"
    finally:
        # Best-effort cleanup of the temp file in every path.
        if tmp_path:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
def generate_mermaid_from_summary(summary: str):
    """Turn a bullet-point summary into Mermaid flowchart source.

    Tries the Hugging Face inference API when a key is configured; on any
    failure (network error, no fenced code block in the reply) falls back to
    a local linear chart: one node per summary bullet, capped at 8 nodes.
    """
    prompt = ("Create a concise Mermaid flowchart ('flowchart TD') from the following summary. "
              "Output only the Mermaid code block. Summary:\n" + summary)
    if HUGGINGFACE_KEY:
        url = f"https://api-inference.huggingface.co/models/{HF_MERMAID_MODEL}"
        headers = {"Authorization": f"Bearer {HUGGINGFACE_KEY}"}
        payload = {"inputs": prompt, "parameters": {"max_new_tokens": 512}}
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=40)
            if response.ok and response.json():
                text = response.json()[0]['generated_text']
                match = re.search(r"```(?:mermaid)?\n([\s\S]+?)```", text)
                if match:
                    return match.group(1).strip()
        except Exception:
            pass  # best-effort: fall through to the local chart below
    # Local fallback: one node per summary bullet, linked linearly.
    nodes = [re.sub(r'^- ', '', line).strip() for line in summary.split('\n') if line.strip()]
    if not nodes:
        return "graph TD\n A[Summary Empty]"
    shown = nodes[:8]
    mermaid_code = "graph TD\n"
    for i, node_text in enumerate(shown):
        # Hoisted out of the f-string: re-using the delimiting quote inside an
        # f-string expression is a SyntaxError before Python 3.12 (PEP 701).
        label = node_text.replace('"', "'")[:60]
        mermaid_code += f' A{i}["{label}"]\n'
    for i in range(len(shown) - 1):
        mermaid_code += f" A{i} --> A{i+1}\n"
    return mermaid_code
def render_mermaid(mermaid_code: str):
    """Render Mermaid source in the Streamlit page via the Mermaid CDN bundle.

    The diagram is wrapped in ``.mermaid-container`` (the original defined
    that style but attached it to no element) and ``mermaid.initialize`` is
    called explicitly so rendering does not rely on the bundle's implicit
    autostart behavior.
    """
    html_code = f"""
    <div class="mermaid-container">
      <div class="mermaid">
    {mermaid_code}
      </div>
    </div>
    <script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
    <script>mermaid.initialize({{ startOnLoad: true }});</script>
    <style>
    .mermaid-container {{ height: 420px; border: 1px solid #ddd; padding: 10px; border-radius: 8px; }}
    </style>
    """
    st.components.v1.html(html_code, height=450, scrolling=True)
def local_qa(text: str, query: str) -> str:
    """Keyword-overlap Q&A over the document text.

    Splits *text* into sentences, scores each by how many distinct query
    words it shares, and returns up to the three best matches joined into
    one answer. Falls back to a fixed message when nothing overlaps.
    """
    query_tokens = set(re.findall(r'\w+', query.lower()))
    scored = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        overlap = query_tokens & set(re.findall(r'\w+', sentence.lower()))
        if overlap:
            scored.append((len(overlap), sentence))
    if not scored:
        return "I couldn't find a relevant answer in the document."
    # Stable sort: equal-score sentences keep document order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return " ".join(sentence for _, sentence in scored[:3])
# ============ STREAMLIT UI ============
st.set_page_config(page_title="PDF Assistant", layout="wide")
st.title("π PDF Assistant: Summary, Diagram, Q&A")
st.markdown("---")

# Seed session state so later code can read these keys unconditionally.
for _key, _default in (
    ("raw_text", None),
    ("summary", None),
    ("mermaid_code", None),
    ("chat_history", []),
):
    st.session_state.setdefault(_key, _default)

with st.sidebar:
    st.header("π API Status")
    hf_status = 'β Key present' if HUGGINGFACE_KEY else 'β Key missing. Diagram will be local.'
    tts_status = 'β Active' if HAS_PYTTSX3 else 'β Not available. Run `pip install pyttsx3`'
    st.markdown(f"**Hugging Face:** {hf_status}")
    st.markdown(f"**Local TTS:** {tts_status}")
uploaded_file = st.file_uploader("1. Upload a PDF", type=["pdf"])

# Only extract once per session: raw_text stays populated across reruns.
if uploaded_file and st.session_state.raw_text is None:
    with st.spinner("Extracting text..."):
        extracted = extract_text_from_pdf(uploaded_file)
        st.session_state.raw_text = extracted
    if extracted:
        st.success("Text extracted successfully!")
    else:
        st.warning("No text extracted from PDF. Is it a scanned image?")
# Main interaction area: only shown once text has been extracted.
if st.session_state.raw_text:
    st.markdown("---")
    if st.button("2. Generate Summary & Diagram"):
        with st.spinner("Generating summary and diagram..."):
            st.session_state.summary = local_summary(st.session_state.raw_text)
            st.session_state.mermaid_code = generate_mermaid_from_summary(st.session_state.summary)
        st.success("Summary and diagram generated!")

    if st.session_state.summary:
        st.header("π Summary")
        st.markdown(st.session_state.summary)

        st.header("πΊοΈ Diagram")
        render_mermaid(st.session_state.mermaid_code)
        st.code(st.session_state.mermaid_code, language="mermaid")

        st.header("π Audio")
        if st.button("Generate Audio"):
            with st.spinner("Generating audio..."):
                ok, out = pyttsx3_tts_file(st.session_state.summary)
            if ok:
                st.audio(out, format="audio/wav")
                # Plain string: the original used an f-string with no placeholders.
                st.info("Audio generated using: **pyttsx3**")
            else:
                # Surface the reason — `out` carries the error text on failure;
                # the original discarded it and showed a generic message.
                st.error(f"Audio generation failed: {out}")

    st.markdown("---")
    st.header("π¬ Q&A Chatbot")
    # Replay the conversation so far (chat_history holds (role, content) pairs).
    for role, content in st.session_state.chat_history:
        with st.chat_message(role):
            st.markdown(content)

    prompt = st.chat_input("Ask a question about the PDF")
    if prompt:
        st.session_state.chat_history.append(("user", prompt))
        with st.chat_message("user"):
            st.markdown(prompt)
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                answer = local_qa(st.session_state.raw_text, prompt)
            st.markdown(answer)
        st.session_state.chat_history.append(("assistant", answer))