"""DocAnalyzer Pro — Streamlit app for document QA and summarization.

Runs two CPU-only Hugging Face pipelines (extractive QA + abstractive
summarization) over text extracted from uploaded PDF/DOCX files or
pasted manually.
"""

import os

# Server settings must be in the environment before Streamlit is imported.
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"

# Must be first Streamlit command
import streamlit as st

st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")

# Import libraries
import time
from pathlib import Path

import docx
import psutil
import torch  # FIX: was referenced in the System Status panel but never imported
from PyPDF2 import PdfReader
from transformers import pipeline

# ======================
# CACHE SETUP
# ======================
def setup_environment():
    """Ensure the model cache directory exists and return its path."""
    cache_dir = Path("/app/models")
    cache_dir.mkdir(exist_ok=True, parents=True)
    return cache_dir

cache_dir = setup_environment()

# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)  # Cache for 1 hour
def load_models():
    """Load optimized models for Hugging Face Spaces.

    Returns:
        dict with 'qa' (extractive question-answering) and 'summarizer'
        (abstractive summarization) pipelines, both pinned to CPU.
    """
    with st.spinner("🔄 Loading AI models (this may take 1-2 minutes)..."):
        return {
            'qa': pipeline(
                "question-answering",
                model="distilbert-base-cased-distilled-squad",
                device=-1,  # Force CPU
            ),
            'summarizer': pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-12-6",
                device=-1,  # Force CPU
            ),
        }

models = load_models()

# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: a Streamlit UploadedFile (exposes ``.type`` MIME string).

    Returns:
        Extracted text, or "" for unsupported types or on any error.
        (FIX: previously returned None implicitly for unsupported types,
        which would break ``context[:100000]`` downstream.)
    """
    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            # FIX: call extract_text() once per page (it was called twice,
            # doubling the most expensive step); skip pages with no text.
            page_texts = (page.extract_text() for page in reader.pages)
            return " ".join(text for text in page_texts if text)
        elif file.type == docx_mime:
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
        return ""
    except Exception as e:
        st.error(f"⚠️ Error processing document: {str(e)}")
        return ""

# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Generate a summary, chunking documents longer than 10k characters.

    Args:
        text: document text; empty input yields "".
        max_length: target token budget for the whole summary.

    Returns:
        Summary string, or "" on failure (error shown in the UI).
    """
    if not text:
        return ""
    try:
        if len(text) > 10000:
            # Chunk large documents into ~3000-char windows.
            chunks = [text[i:i + 3000] for i in range(0, len(text), 3000)]
            # FIX: with many chunks, max_length//len(chunks) can fall below
            # min_length=30, which the summarizer rejects — enforce a floor.
            per_chunk_max = max(max_length // len(chunks), 40)
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    max_length=per_chunk_max,
                    min_length=30,
                    do_sample=False,
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length)[0]['summary_text']
    except Exception as e:
        st.error(f"❌ Summarization failed: {str(e)}")
        return ""

# ======================
# STREAMLIT UI
# ======================
st.title("📄 DocAnalyzer Pro")

# File Upload Section — an uploaded file takes precedence over pasted text.
with st.expander("📤 Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)
    context = extract_text(uploaded_file) if uploaded_file else manual_text

# Main Features
tab1, tab2 = st.tabs(["🔍 Question Answering", "📝 Summarization"])

with tab1:
    if context:
        question = st.text_input("Ask about the document:")
        if question:
            with st.spinner("Analyzing..."):
                start_time = time.time()
                # Limit context size to keep CPU latency bounded.
                result = models['qa'](question=question, context=context[:100000])
                st.success(f"Answered in {time.time()-start_time:.1f}s")
                st.markdown(f"**Answer:** {result['answer']}")
                st.progress(result['score'])
                st.caption(f"Confidence: {result['score']:.0%}")

with tab2:
    if context:
        with st.form("summary_form"):
            length = st.slider("Summary Length", 50, 300, 150)
            if st.form_submit_button("Generate Summary"):
                with st.spinner("Summarizing..."):
                    start_time = time.time()
                    summary = generate_summary(context, length)
                    st.success(f"Generated in {time.time()-start_time:.1f}s")
                    st.markdown(f"**Summary:**\n\n{summary}")

# System Info
with st.expander("⚙️ System Status"):
    st.code(f"""
Models loaded: {', '.join(models.keys())}
Device: {'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'}
Memory: {psutil.virtual_memory().percent}% used
CPU: {psutil.cpu_percent()}% used
""")