File size: 4,699 Bytes
3770496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Configure the Streamlit server via environment variables before the
# framework is imported (these are read at import time).
import os
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"

# Must be first Streamlit command
import streamlit as st
st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")

# Import libraries
from transformers import pipeline
from PyPDF2 import PdfReader
import docx
import time
import psutil
from pathlib import Path

# ======================
# CACHE SETUP
# ======================
def setup_environment():
    """Ensure the model cache directory exists and point Hugging Face at it.

    Returns:
        Path: the cache directory (``/app/models``).
    """
    cache_dir = Path("/app/models")
    cache_dir.mkdir(exist_ok=True, parents=True)
    # Bug fix: the directory was created but never used — transformers kept
    # caching under ~/.cache by default. Point the Hugging Face cache here
    # (setdefault so an HF_HOME already set by the platform still wins).
    os.environ.setdefault("HF_HOME", str(cache_dir))
    return cache_dir

cache_dir = setup_environment()

# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)  # Re-create the pipelines at most once per hour
def load_models():
    """Build the CPU-only QA and summarization pipelines used by the app."""
    with st.spinner("πŸ”„ Loading AI models (this may take 1-2 minutes)..."):
        qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
            device=-1,  # Force CPU
        )
        summarizer_pipeline = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=-1,  # Force CPU
        )
        return {'qa': qa_pipeline, 'summarizer': summarizer_pipeline}

models = load_models()

# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: a Streamlit ``UploadedFile`` (file-like object whose ``type``
            attribute holds the MIME type).

    Returns:
        str: the extracted text, or "" on error or unsupported type.
    """
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            # Extract each page once (previously extracted twice per page),
            # skipping pages with no text layer (e.g. scanned images).
            page_texts = (page.extract_text() for page in reader.pages)
            return " ".join(text for text in page_texts if text)
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
        # Bug fix: previously fell through and returned None for any other
        # MIME type; callers treat the result as a string, so return "".
        return ""
    except Exception as e:
        st.error(f"⚠️ Error processing document: {str(e)}")
        return ""

# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Generate a summary, chunking large documents first.

    Args:
        text: the document text (may be "" or None).
        max_length: target maximum summary length in tokens.

    Returns:
        str: the summary, or "" for empty input or on failure.
    """
    if not text:
        return ""

    try:
        if len(text) > 10000:  # Chunk large documents
            chunks = [text[i:i+3000] for i in range(0, len(text), 3000)]
            # Bug fix: max_length // len(chunks) could fall below min_length
            # (30) for long documents, which makes the summarization pipeline
            # reject the call. Keep every per-chunk budget above min_length;
            # also hoist it out of the loop (it is loop-invariant).
            chunk_max = max(max_length // len(chunks), 40)
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    max_length=chunk_max,
                    min_length=30,
                    do_sample=False
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length)[0]['summary_text']
    except Exception as e:
        st.error(f"❌ Summarization failed: {str(e)}")
        return ""

# ======================
# STREAMLIT UI
# ======================
st.title("πŸ“„ DocAnalyzer Pro")

# File Upload Section
with st.expander("πŸ“€ Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)
    # An uploaded file takes precedence over pasted text.
    context = extract_text(uploaded_file) if uploaded_file else manual_text

# Main Features
tab1, tab2 = st.tabs(["πŸ” Question Answering", "πŸ“ Summarization"])

with tab1:
    # Only show the QA input once there is document text to query.
    if context:
        question = st.text_input("Ask about the document:")
        if question:
            with st.spinner("Analyzing..."):
                start_time = time.time()
                result = models['qa'](question=question, context=context[:100000])  # Limit context size
                st.success(f"Answered in {time.time()-start_time:.1f}s")
                st.markdown(f"**Answer:** {result['answer']}")
                # The model's confidence score in [0, 1] doubles as the
                # progress-bar value.
                st.progress(result['score'])
                st.caption(f"Confidence: {result['score']:.0%}")

with tab2:
    if context:
        # A form batches the slider + button so the summary is regenerated
        # only on submit, not on every slider move.
        with st.form("summary_form"):
            length = st.slider("Summary Length", 50, 300, 150)
            if st.form_submit_button("Generate Summary"):
                with st.spinner("Summarizing..."):
                    start_time = time.time()
                    summary = generate_summary(context, length)
                    st.success(f"Generated in {time.time()-start_time:.1f}s")
                    st.markdown(f"**Summary:**\n\n{summary}")

# System Info
with st.expander("βš™οΈ System Status"):
    # Bug fix: torch was referenced below but never imported, so rendering
    # this expander raised NameError. Import it lazily (transformers pulls
    # it in anyway) and fall back to CPU if it is somehow unavailable.
    try:
        import torch
        device_label = 'GPU βœ…' if torch.cuda.is_available() else 'CPU ⚠️'
    except ImportError:
        device_label = 'CPU ⚠️'
    st.code(f"""
    Models loaded: {', '.join(models.keys())}
    Device: {device_label}
    Memory: {psutil.virtual_memory().percent}% used
    CPU: {psutil.cpu_percent()}% used
    """)