Spaces:

amritn8
/

FINALLL

Sleeping

App Files Community

amritn8 committed on Jun 12, 2025

Commit

ecf4e97

verified ·

1 Parent(s): d80c9f5

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -80

app.py CHANGED Viewed

@@ -1,116 +1,149 @@
 import streamlit as st
 import torch
-from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
 from PyPDF2 import PdfReader
 import docx
-import os
-from time import time
-# Configure environment
-cache_dir = os.path.join(os.getcwd(), "model_cache")
-os.makedirs(cache_dir, exist_ok=True)
-os.environ["TRANSFORMERS_CACHE"] = cache_dir
-# ----------------------------
 # MODEL LOADING
-# ----------------------------
-@st.cache_resource(show_spinner=False)
 def load_models():
-    """Load all models with progress tracking"""
     models = {}
-    with st.spinner("🚀 Loading QA Model..."):
-        models['qa'] = pipeline(
-            "question-answering",
-            model="deepset/roberta-base-squad2",
-            device=0 if torch.cuda.is_available() else -1
-        )
-    with st.spinner("📝 Loading Summarization Model..."):
-        models['summarizer'] = pipeline(
-            "summarization",
-            model="facebook/bart-large-cnn",
-            tokenizer="facebook/bart-large-cnn",
-            device=0 if torch.cuda.is_available() else -1
-        )
     return models
 models = load_models()
-# ----------------------------
 # DOCUMENT PROCESSING
-# ----------------------------
 def extract_text(file):
     """Universal text extractor for PDF/DOCX"""
-    if file.type == "application/pdf":
-        reader = PdfReader(file)
-        return " ".join([page.extract_text() for page in reader.pages])
-    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-        doc = docx.Document(file)
-        return "\n".join(para.text for para in doc.paragraphs if para.text)
-    return ""
-# ----------------------------
-# SUMMARIZATION FUNCTION
-# ----------------------------
-def summarize(text, max_length=150, min_length=30):
-    """Advanced summarization with chunking for long documents"""
     try:
-        if len(text.split()) > 1000:  # Chunking for large documents
             chunks = [text[i:i+3000] for i in range(0, len(text), 3000)]
             summaries = []
             for chunk in chunks:
-                summary = models['summarizer'](
                     chunk,
-                    max_length=max_length,
-                    min_length=min_length,
                     do_sample=False
                 )
-                summaries.append(summary[0]['summary_text'])
             return " ".join(summaries)
-        return models['summarizer'](text, max_length=max_length, min_length=min_length)[0]['summary_text']
     except Exception as e:
-        st.error(f"Summarization error: {str(e)}")
         return ""
-# ----------------------------
 # STREAMLIT UI
-# ----------------------------
-st.title("📚 Document Intelligence Suite")
-# Main Document Input
-with st.expander("📄 Upload Document", expanded=True):
     uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
-    manual_text = st.text_area("Or paste raw text here:", height=150)
     context = extract_text(uploaded_file) if uploaded_file else manual_text
-# ----------------------------
-# ADVANCED FEATURES
-# ----------------------------
-with st.expander("🔧 Advanced Tools", expanded=False):
-    st.header("📝 Document Summarization")
-    if st.button("Generate Summary"):
-        if not context:
-            st.warning("Please provide content first")
-        else:
-            with st.spinner("Analyzing document..."):
-                start_time = time()
-                summary = summarize(context)
-                st.success(f"Generated in {time()-start_time:.1f}s")
-                st.markdown(f"**Summary:**\n\n{summary}")
-    st.header("⚙️ Customization")
-    max_len = st.slider("Summary Length", 50, 300, 150)
-    show_chunks = st.checkbox("Show processing chunks", False)
-# Question Answering Section
-if context:
-    st.header("❓ Question Answering")
-    question = st.text_input("Ask about the document:")
-    if question:
-        with st.spinner("Searching for answers..."):
-            result = models['qa'](question=question, context=context[:100000])  # 100k char limit
-            st.markdown(f"**Answer:** {result['answer']}")
-            st.caption(f"Confidence: {result['score']:.0%}")

 import streamlit as st
 import torch
+from transformers import pipeline
+import os
+import shutil
+from pathlib import Path
 from PyPDF2 import PdfReader
 import docx
+import time
+# ======================
+# CACHE CONFIGURATION
+# ======================
+def setup_environment():
+    """Configure cache with guaranteed write permissions"""
+    cache_dir = Path("/tmp/model_cache")
+    lock_dir = cache_dir / ".locks"
+    # Clear any existing locks
+    if lock_dir.exists():
+        shutil.rmtree(lock_dir, ignore_errors=True)
+    cache_dir.mkdir(exist_ok=True, parents=True)
+    os.environ["TRANSFORMERS_CACHE"] = str(cache_dir)
+    os.environ["HF_HOME"] = str(cache_dir)
+    return cache_dir
+cache_dir = setup_environment()
+# ======================
 # MODEL LOADING
+# ======================
+@st.cache_resource(ttl=3600)  # Cache for 1 hour
 def load_models():
+    """Load all NLP models with error recovery"""
     models = {}
+    try:
+        # Question Answering
+        with st.spinner("🔍 Loading QA Model..."):
+            models['qa'] = pipeline(
+                "question-answering",
+                model="deepset/roberta-base-squad2",
+                device=0 if torch.cuda.is_available() else -1
+            )
+        # Summarization
+        with st.spinner("📝 Loading Summarizer..."):
+            models['summarizer'] = pipeline(
+                "summarization",
+                model="facebook/bart-large-cnn",
+                device=0 if torch.cuda.is_available() else -1
+            )
+    except Exception as e:
+        st.error(f"❌ Model loading failed: {str(e)}")
+        st.stop()
     return models
 models = load_models()
+# ======================
 # DOCUMENT PROCESSING
+# ======================
 def extract_text(file):
     """Universal text extractor for PDF/DOCX"""
+    try:
+        if file.type == "application/pdf":
+            reader = PdfReader(file)
+            return " ".join(page.extract_text() for page in reader.pages if page.extract_text())
+        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+            doc = docx.Document(file)
+            return "\n".join(para.text for para in doc.paragraphs if para.text)
+    except Exception as e:
+        st.error(f"Document processing error: {str(e)}")
+        return ""
+# ======================
+# CORE FUNCTIONS
+# ======================
+def generate_summary(text, max_length=150):
+    """Chunk-aware summarization"""
+    if not text:
+        return ""
     try:
+        if len(text) > 10000:  # Chunk large documents
             chunks = [text[i:i+3000] for i in range(0, len(text), 3000)]
             summaries = []
             for chunk in chunks:
+                result = models['summarizer'](
                     chunk,
+                    max_length=max_length//len(chunks),
+                    min_length=30,
                     do_sample=False
                 )
+                summaries.append(result[0]['summary_text'])
             return " ".join(summaries)
+        return models['summarizer'](text, max_length=max_length)[0]['summary_text']
     except Exception as e:
+        st.error(f"Summarization failed: {str(e)}")
         return ""
+# ======================
 # STREAMLIT UI
+# ======================
+st.set_page_config(page_title="DocAnalyzer Pro", layout="wide")
+st.title("📄 Document Analyzer Pro")
+# File Upload
+with st.expander("📤 Upload Document", expanded=True):
     uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
+    manual_text = st.text_area("Or paste text here:", height=200)
     context = extract_text(uploaded_file) if uploaded_file else manual_text
+# Main Features
+tab1, tab2 = st.tabs(["🔍 Question Answering", "📝 Summarization"])
+with tab1:
+    if context:
+        question = st.text_input("Ask about the document:")
+        if question:
+            with st.spinner("Analyzing..."):
+                start = time.time()
+                result = models['qa'](question=question, context=context[:100000])
+                st.success(f"Answered in {time.time()-start:.1f}s")
+                st.markdown(f"**Answer:** {result['answer']}")
+                st.progress(result['score'])
+                st.caption(f"Confidence: {result['score']:.0%}")
+with tab2:
+    if context:
+        with st.form("summary_form"):
+            length = st.slider("Summary Length", 50, 300, 150)
+            if st.form_submit_button("Generate Summary"):
+                with st.spinner("Summarizing..."):
+                    start = time.time()
+                    summary = generate_summary(context, length)
+                    st.success(f"Generated in {time.time()-start:.1f}s")
+                    st.markdown(f"**Summary:**\n\n{summary}")
+# Debug Info
+with st.expander("⚙️ System Info"):
+    st.code(f"""
+    Cache directory: {cache_dir}
+    Device: {'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'}
+    Models loaded: {', '.join(models.keys())}
+    """)