# final_v2 / app.py
# Author: Amritpal Singh ("Hello ji") — ID 3770496
import os

# Configure the server *before* streamlit is imported — presumably these
# env vars are read at import/startup time (TODO confirm for this
# Streamlit version).
os.environ["STREAMLIT_SERVER_PORT"] = "8501"
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
# Must be first Streamlit command
import streamlit as st
st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")
# Import libraries
from transformers import pipeline
from PyPDF2 import PdfReader
import docx
import time
import psutil
from pathlib import Path
# ======================
# CACHE SETUP
# ======================
def setup_environment(cache_path="/app/models"):
    """Ensure the model cache directory exists and return it.

    Args:
        cache_path: directory to create; defaults to "/app/models"
            (parameterized so tests/deployments can relocate it).

    Returns:
        pathlib.Path of the (now existing) cache directory.
    """
    cache_dir = Path(cache_path)
    # parents=True + exist_ok=True makes this idempotent on reruns.
    cache_dir.mkdir(exist_ok=True, parents=True)
    return cache_dir
cache_dir = setup_environment()
# ======================
# MODEL LOADING
# ======================
@st.cache_resource(ttl=3600)  # Cache for 1 hour
def load_models():
    """Build the CPU-only inference pipelines used by the app.

    Returns:
        dict with keys 'qa' (extractive question answering) and
        'summarizer' (abstractive summarization).
    """
    with st.spinner("πŸ”„ Loading AI models (this may take 1-2 minutes)..."):
        qa_pipe = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
            device=-1,  # Force CPU
        )
        sum_pipe = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=-1,  # Force CPU
        )
    return {'qa': qa_pipe, 'summarizer': sum_pipe}
models = load_models()
# ======================
# DOCUMENT PROCESSING
# ======================
def extract_text(file):
    """Extract plain text from an uploaded PDF or DOCX file.

    Args:
        file: Streamlit UploadedFile (exposes a ``.type`` MIME string and a
            file-like interface), or None.

    Returns:
        Extracted text; "" for None input, unsupported MIME types, or on
        any extraction error (error is surfaced via st.error).
    """
    if file is None:
        return ""
    try:
        if file.type == "application/pdf":
            reader = PdfReader(file)
            # Call extract_text() once per page (it is expensive); skip
            # pages that yield None/empty text.
            page_texts = (page.extract_text() for page in reader.pages)
            return " ".join(t for t in page_texts if t)
        if file.type == ("application/vnd.openxmlformats-officedocument"
                         ".wordprocessingml.document"):
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
    except Exception as e:
        st.error(f"⚠️ Error processing document: {str(e)}")
    # Unsupported type or failure: always return a string, never None.
    return ""
# ======================
# CORE FUNCTIONS
# ======================
def generate_summary(text, max_length=150):
    """Summarize *text*, chunking documents longer than 10k characters.

    Args:
        text: document text; falsy input yields "".
        max_length: target total summary length in tokens; for chunked
            documents the budget is split across chunks.

    Returns:
        Summary string, or "" on empty input / pipeline failure
        (failure is surfaced via st.error).
    """
    if not text:
        return ""
    try:
        if len(text) > 10000:  # Chunk large documents
            chunks = [text[i:i + 3000] for i in range(0, len(text), 3000)]
            # Split the length budget across chunks, but never let the
            # per-chunk max drop below min_length=30 — an inverted
            # min/max pair is invalid for the generation pipeline.
            per_chunk_max = max(max_length // len(chunks), 40)
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    max_length=per_chunk_max,
                    min_length=30,
                    do_sample=False,
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](text, max_length=max_length)[0]['summary_text']
    except Exception as e:
        st.error(f"❌ Summarization failed: {str(e)}")
        return ""
# ======================
# STREAMLIT UI
# ======================
st.title("πŸ“„ DocAnalyzer Pro")

# File Upload Section
with st.expander("πŸ“€ Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)

# An uploaded file takes precedence over pasted text.
context = extract_text(uploaded_file) if uploaded_file else manual_text

# Main Features
tab1, tab2 = st.tabs(["πŸ” Question Answering", "πŸ“ Summarization"])
with tab1:
    # QA is only offered once some document text is available.
    if context:
        question = st.text_input("Ask about the document:")
        if question:
            with st.spinner("Analyzing..."):
                start_time = time.time()
                result = models['qa'](question=question, context=context[:100000])  # Limit context size
            st.success(f"Answered in {time.time()-start_time:.1f}s")
            st.markdown(f"**Answer:** {result['answer']}")
            # QA pipeline score is in [0, 1]; shown as a bar + percentage.
            st.progress(result['score'])
            st.caption(f"Confidence: {result['score']:.0%}")
with tab2:
    # Summarization is only offered once some document text is available.
    if context:
        # A form batches the slider + button into one rerun on submit.
        with st.form("summary_form"):
            length = st.slider("Summary Length", 50, 300, 150)
            if st.form_submit_button("Generate Summary"):
                with st.spinner("Summarizing..."):
                    start_time = time.time()
                    summary = generate_summary(context, length)
                st.success(f"Generated in {time.time()-start_time:.1f}s")
                st.markdown(f"**Summary:**\n\n{summary}")
# System Info
with st.expander("βš™οΈ System Status"):
    # Fix: torch was referenced here but never imported anywhere in the
    # file, so rendering this panel raised NameError. Import locally —
    # it is only needed for this diagnostic readout.
    import torch
    device_label = 'GPU βœ…' if torch.cuda.is_available() else 'CPU ⚠️'
    st.code(f"""
Models loaded: {', '.join(models.keys())}
Device: {device_label}
Memory: {psutil.virtual_memory().percent}% used
CPU: {psutil.cpu_percent()}% used
""")