import os
# Configure the Streamlit server via environment variables BEFORE streamlit
# is imported, so the settings take effect at import time.
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
# Must be first Streamlit command
import streamlit as st
st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")
# Import libraries
from transformers import pipeline  # HF pipelines: QA + summarization models
from PyPDF2 import PdfReader  # PDF text extraction (see extract_text)
import docx  # .docx text extraction (see extract_text)
import time
import psutil  # memory/CPU metrics for the status panel
from pathlib import Path
# ======================
# CACHE SETUP
# ======================
def setup_environment(cache_path="/app/models"):
    """Ensure the model cache directory exists.

    Args:
        cache_path: Directory to create. The default preserves the original
            hard-coded container location; callers may now override it
            (e.g. for local development or tests).

    Returns:
        Path: the existing cache directory.
    """
    cache_dir = Path(cache_path)
    # parents=True so a fresh container with no /app/models tree still works
    cache_dir.mkdir(exist_ok=True, parents=True)
    return cache_dir
cache_dir = setup_environment()
# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)  # Cache for 1 hour
def load_models():
    """Build the CPU-only QA and summarization pipelines used by the app."""
    with st.spinner("π Loading AI models (this may take 1-2 minutes)..."):
        qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
            device=-1,  # Force CPU
        )
        summarizer_pipeline = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=-1,  # Force CPU
        )
        return {'qa': qa_pipeline, 'summarizer': summarizer_pipeline}
models = load_models()
# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: An uploaded-file-like object exposing `.type` (MIME string).

    Returns:
        str: extracted text; "" for unsupported types or on any error
        (the error is surfaced to the UI via st.error).
    """
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            # Extract each page exactly once (previously extracted twice:
            # once for the filter, once for the join).
            page_texts = (page.extract_text() for page in reader.pages)
            return " ".join(text for text in page_texts if text)
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
    except Exception as e:
        st.error(f"β οΈ Error processing document: {str(e)}")
    # Unsupported MIME type or failed extraction falls through to empty text.
    return ""
# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Generate a summary, chunking very large documents.

    Args:
        text: Document text (falsy input yields "").
        max_length: Overall summary token budget; for chunked documents it
            is divided across chunks.

    Returns:
        str: the summary, or "" on empty input or failure (error shown in UI).
    """
    if not text:
        return ""
    try:
        if len(text) > 10000:  # Chunk large documents
            chunks = [text[i:i+3000] for i in range(0, len(text), 3000)]
            # Clamp the per-chunk budget: max_length // len(chunks) can fall
            # below min_length=30 for long documents, which the summarizer
            # rejects. Keep it strictly above min_length.
            per_chunk_max = max(max_length // len(chunks), 40)
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    max_length=per_chunk_max,
                    min_length=30,
                    do_sample=False
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length)[0]['summary_text']
    except Exception as e:
        st.error(f"β Summarization failed: {str(e)}")
        return ""
# ======================
# STREAMLIT UI
# ======================
st.title("π DocAnalyzer Pro")
# File Upload Section: a document upload plus a pasted-text fallback
with st.expander("π€ Upload Document", expanded=True):
    upload = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    pasted = st.text_area("Or paste text here:", height=150)
# An uploaded document takes precedence over pasted text.
if upload:
    context = extract_text(upload)
else:
    context = pasted
# Main Features
tab1, tab2 = st.tabs(["π Question Answering", "π Summarization"])
with tab1:
    if context:
        question = st.text_input("Ask about the document:")
        if question:
            with st.spinner("Analyzing..."):
                qa_started = time.time()
                # Cap the context so very large documents stay tractable.
                qa_result = models['qa'](question=question, context=context[:100000])
                elapsed = time.time() - qa_started
                st.success(f"Answered in {elapsed:.1f}s")
                st.markdown(f"**Answer:** {qa_result['answer']}")
                st.progress(qa_result['score'])
                st.caption(f"Confidence: {qa_result['score']:.0%}")
with tab2:
    if context:
        with st.form("summary_form"):
            target_len = st.slider("Summary Length", 50, 300, 150)
            submitted = st.form_submit_button("Generate Summary")
            if submitted:
                with st.spinner("Summarizing..."):
                    t0 = time.time()
                    summary_text = generate_summary(context, target_len)
                    st.success(f"Generated in {time.time()-t0:.1f}s")
                    st.markdown(f"**Summary:**\n\n{summary_text}")
# System Info
with st.expander("βοΈ System Status"):
    # BUG FIX: the original referenced `torch` without importing it anywhere,
    # so opening this expander raised NameError. Import it locally and
    # degrade gracefully if torch is unavailable in this deployment.
    try:
        import torch
        device_label = "GPU" if torch.cuda.is_available() else "CPU"
    except ImportError:
        device_label = "CPU"
    st.code(f"""
Models loaded: {', '.join(models.keys())}
Device: {device_label}
Memory: {psutil.virtual_memory().percent}% used
CPU: {psutil.cpu_percent()}% used
""")