Spaces:

amritn8
/

FINALLL

Sleeping

App Files Files Community

amritn8 committed on Jun 12, 2025

Commit

d80c9f5

verified ·

1 Parent(s): 56735e5

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -70

app.py CHANGED Viewed

@@ -1,92 +1,116 @@
 import streamlit as st
 import torch
-import os
-from transformers import pipeline
-import fitz  # PyMuPDF
 import docx
 from time import time
-# Configure logging
-import logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# ----------------------------
-# SETUP & MODEL LOAD
-# ----------------------------
-st.set_page_config(page_title="Fast QA App", layout="wide")
-st.title("🧠 Instant Question Answering")
-# Set cache directory
 cache_dir = os.path.join(os.getcwd(), "model_cache")
 os.makedirs(cache_dir, exist_ok=True)
 os.environ["TRANSFORMERS_CACHE"] = cache_dir
-# Load model with progress indicator
-@st.cache_resource(show_spinner="Loading AI model...")
-def load_qa_model():
-    logger.info(f"Loading model at {time()}")
-    return pipeline(
-        "question-answering",
-        model="distilbert-base-uncased-distilled-squad",  # Faster alternative
-        device=0 if torch.cuda.is_available() else -1
-    )
-qa_pipeline = load_qa_model()
-st.success("Model loaded successfully!")
 # ----------------------------
-# TEXT EXTRACTION FUNCTIONS
 # ----------------------------
-def extract_text_from_pdf(uploaded_file):
-    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
-        return " ".join(page.get_text() for page in doc)
-def extract_text_from_docx(uploaded_file):
-    doc = docx.Document(uploaded_file)
-    return "\n".join(para.text for para in doc.paragraphs if para.text)
 # ----------------------------
 # STREAMLIT UI
 # ----------------------------
-with st.form("qa_form"):
-    st.subheader("📄 Document Input")
-    uploaded_file = st.file_uploader("Upload PDF/DOCX", type=["pdf", "docx"])
-    manual_text = st.text_area("Or paste text here:", height=150)
-    st.subheader("❓ Question Input")
-    question = st.text_input("Enter your question:")
-    submit_btn = st.form_submit_button("Get Answer")
-if submit_btn:
-    context = ""
-    if uploaded_file:
-        file_type = uploaded_file.name.split(".")[-1].lower()
-        if file_type == "pdf":
-            context = extract_text_from_pdf(uploaded_file)
-        elif file_type == "docx":
-            context = extract_text_from_docx(uploaded_file)
-    else:
-        context = manual_text
-    if not context:
-        st.warning("Please provide either a document or text input")
-    elif not question:
-        st.warning("Please enter a question")
-    else:
-        with st.spinner("Analyzing content..."):
-            try:
-                result = qa_pipeline(question=question, context=context[:10000])  # Limit context length
-                st.markdown(f"### ✅ Answer: {result['answer']}")
-                st.progress(result["score"])  # Show confidence score
-                st.caption(f"Confidence: {result['score']:.0%}")
-            except Exception as e:
-                st.error(f"Error processing request: {str(e)}")
 # ----------------------------
-# ADVANCED SECTION
 # ----------------------------
-with st.expander("⚙️ Advanced Options"):
-    st.subheader("Model Information")
-    st.code(f"Using: distilbert-base-uncased-distilled-squad")
-    st.caption("Optimized for fast inference on limited resources")

 import streamlit as st
 import torch
+from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
+from PyPDF2 import PdfReader
 import docx
+import os
 from time import time
+# Configure environment: create ./model_cache under the current working
+# directory and export its path through the TRANSFORMERS_CACHE variable.
+# NOTE(review): this assignment runs after the `from transformers import ...`
+# line above — confirm the library still honours a value set this late.
 cache_dir = os.path.join(os.getcwd(), "model_cache")
 os.makedirs(cache_dir, exist_ok=True)
 os.environ["TRANSFORMERS_CACHE"] = cache_dir
+# ----------------------------
+# MODEL LOADING
+# ----------------------------
+@st.cache_resource(show_spinner=False)
+def load_models():
+    """Build the question-answering and summarization pipelines.
+
+    Each pipeline is created under its own spinner. Both use device index 0
+    when CUDA is available and -1 otherwise.
+
+    Returns:
+        dict with the keys 'qa' and 'summarizer', each holding a pipeline.
+    """
+    # Same device choice for both pipelines, so evaluate it once.
+    device = 0 if torch.cuda.is_available() else -1
+    with st.spinner("🚀 Loading QA Model..."):
+        qa_pipeline = pipeline(
+            "question-answering",
+            model="deepset/roberta-base-squad2",
+            device=device,
+        )
+    with st.spinner("📝 Loading Summarization Model..."):
+        summarization_pipeline = pipeline(
+            "summarization",
+            model="facebook/bart-large-cnn",
+            tokenizer="facebook/bart-large-cnn",
+            device=device,
+        )
+    return {'qa': qa_pipeline, 'summarizer': summarization_pipeline}
+models = load_models()
 # ----------------------------
+# DOCUMENT PROCESSING
 # ----------------------------
+def extract_text(file):
+    """Return the text of an uploaded PDF or DOCX file.
+
+    The format is taken from ``file.type`` (the MIME type) and, when that is
+    missing or does not match, from the extension of ``file.name``.
+
+    Args:
+        file: Uploaded file object exposing ``type`` and/or ``name`` and
+            readable by ``PdfReader`` / ``docx.Document``.
+
+    Returns:
+        The extracted text, or "" when the format is not recognised.
+    """
+    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    mime = getattr(file, "type", "") or ""
+    # The MIME type is not always reliable, so the extension is a fallback.
+    extension = os.path.splitext(getattr(file, "name", "") or "")[1].lower()
+    if mime == "application/pdf" or extension == ".pdf":
+        reader = PdfReader(file)
+        # A page without a text layer may yield nothing; keep the join safe.
+        return " ".join(page.extract_text() or "" for page in reader.pages)
+    if mime == docx_mime or extension == ".docx":
+        doc = docx.Document(file)
+        return "\n".join(para.text for para in doc.paragraphs if para.text)
+    return ""
+# ----------------------------
+# SUMMARIZATION FUNCTION
+# ----------------------------
+def _split_on_words(text, limit=3000):
+    """Split text into space-joined pieces of at most about ``limit`` characters."""
+    pieces, words, size = [], [], 0
+    for word in text.split():
+        # Close the current piece before it would grow past the limit.
+        if words and size + len(word) + 1 > limit:
+            pieces.append(" ".join(words))
+            words, size = [], 0
+        words.append(word)
+        size += len(word) + 1
+    if words:
+        pieces.append(" ".join(words))
+    return pieces
+def summarize(text, max_length=150, min_length=30):
+    """Summarize text, handling long documents chunk by chunk.
+
+    The text is split on word boundaries into pieces of roughly 3000
+    characters; each piece is summarized separately and the partial
+    summaries are joined with spaces.
+
+    Args:
+        text: Document text to summarize.
+        max_length: Upper bound passed to the summarizer for each piece.
+        min_length: Lower bound passed to the summarizer for each piece.
+
+    Returns:
+        The combined summary, or "" for blank input or when summarization
+        fails (the error is reported with ``st.error``).
+    """
+    if not text or not text.strip():
+        return ""
+    try:
+        summaries = [
+            models['summarizer'](
+                chunk,
+                max_length=max_length,
+                min_length=min_length,
+                do_sample=False,
+                # Guard against a piece that is still too long for the model
+                # (e.g. one very long unbroken token).
+                truncation=True,
+            )[0]['summary_text']
+            for chunk in _split_on_words(text)
+        ]
+        return " ".join(summaries)
+    except Exception as e:
+        # UI boundary: show the failure instead of crashing the app.
+        st.error(f"Summarization error: {str(e)}")
+        return ""
 # ----------------------------
 # STREAMLIT UI
 # ----------------------------
+st.title("📚 Document Intelligence Suite")
+# Document source: an uploaded file takes precedence over pasted text.
+with st.expander("📄 Upload Document", expanded=True):
+    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
+    manual_text = st.text_area("Or paste raw text here:", height=150)
+    if uploaded_file:
+        context = extract_text(uploaded_file)
+    else:
+        context = manual_text
 # ----------------------------
+# ADVANCED FEATURES
 # ----------------------------
+with st.expander("🔧 Advanced Tools", expanded=False):
+    # The settings are created before the button so that their values exist
+    # when the button branch below runs.
+    st.header("⚙️ Customization")
+    max_len = st.slider("Summary Length", 50, 300, 150)
+    # TODO: this checkbox is not wired to any output yet.
+    show_chunks = st.checkbox("Show processing chunks", False)
+    st.header("📝 Document Summarization")
+    if st.button("Generate Summary"):
+        if not context:
+            st.warning("Please provide content first")
+        else:
+            with st.spinner("Analyzing document..."):
+                start_time = time()
+                summary = summarize(context, max_length=max_len)
+                elapsed = time() - start_time
+            # summarize() returns "" after reporting an error; do not show a
+            # success banner in that case.
+            if summary:
+                st.success(f"Generated in {elapsed:.1f}s")
+                st.markdown(f"**Summary:**\n\n{summary}")
+# Question Answering Section: shown only once some non-blank document text
+# is available.
+if context and context.strip():
+    st.header("❓ Question Answering")
+    question = st.text_input("Ask about the document:")
+    if question and question.strip():
+        with st.spinner("Searching for answers..."):
+            try:
+                # Only the first 100k characters of the document are searched.
+                result = models['qa'](question=question, context=context[:100000])
+            except Exception as e:
+                # UI boundary: report the failure instead of crashing the app.
+                st.error(f"Error processing request: {str(e)}")
+            else:
+                st.markdown(f"**Answer:** {result['answer']}")
+                st.caption(f"Confidence: {result['score']:.0%}")