Spaces:

amritn8
/

FINALLL

Sleeping

App Files Community

Amritpal Singh committed on Jun 12, 2025

Commit

e0e4765

1 Parent(s): 9a6ce9b

Initial commit: Streamlit app

Browse files

Files changed (10) hide show

.env +4 -0
Dockerfile +23 -10
README.md +1 -1
app.py +76 -64
qa_model/config.json +0 -25
qa_model/special_tokens_map.json +0 -7
qa_model/tokenizer.json +0 -0
qa_model/tokenizer_config.json +0 -56
qa_model/vocab.txt +0 -0
requirements.txt +6 -60

.env ADDED Viewed

	@@ -0,0 +1,4 @@

+TRANSFORMERS_CACHE=/tmp/model_cache
+HF_HOME=/tmp/huggingface
+STREAMLIT_SERVER_MAX_UPLOAD_SIZE=500
+STREAMLIT_SERVER_MAX_MESSAGE_SIZE=500

Dockerfile CHANGED Viewed

@@ -1,20 +1,33 @@
 FROM python:3.9-slim
 WORKDIR /app
-# Install git (optional, useful for huggingface model downloads)
-RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-# Copy requirements.txt and install dependencies with no cache to reduce image size
 COPY requirements.txt .
-RUN pip install --upgrade pip
-RUN pip install --no-cache-dir -r requirements.txt
-# Copy all app files
-COPY . .
-# Expose Streamlit default port
 EXPOSE 8501
-# Run Streamlit app
-CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+# Use lightweight Python image
 FROM python:3.9-slim
+# Set up environment
 WORKDIR /app
+ENV PYTHONUNBUFFERED=1 \
+    TRANSFORMERS_CACHE=/app/model_cache \
+    HF_HOME=/app/model_cache
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc \
+    python3-dev \
+    && rm -rf /var/lib/apt/lists/*
+# Create cache directory with write permissions
+RUN mkdir -p /app/model_cache && chmod 777 /app/model_cache
+# Copy only necessary files
 COPY requirements.txt .
+COPY app.py .
+# Install Python packages
+RUN pip install --no-cache-dir -r requirements.txt && \
+    python -c "from transformers import pipeline; pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')"
+# Expose and run
 EXPOSE 8501
+HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8501/_stcore/health || exit 1
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: FINALLL
 emoji: 🚀
 colorFrom: red
 colorTo: red

 ---
+title: Final V1
 emoji: 🚀
 colorFrom: red
 colorTo: red

app.py CHANGED Viewed

@@ -1,80 +1,92 @@
 import streamlit as st
 import torch
-from transformers import AutoModelForQuestionAnswering, AutoTokenizer
-# Set page config
-st.set_page_config(page_title="BERT Question Answering System", layout="centered")
-# Load model and tokenizer from local directory
-@st.cache_resource
-def load_model():
-    model = AutoModelForQuestionAnswering.from_pretrained('qa_model')
-    tokenizer = AutoTokenizer.from_pretrained('qa_model')
-    return model, tokenizer
-model, tokenizer = load_model()
-# Function to get answer from model
-def get_answer(question, context):
-    inputs = tokenizer.encode_plus(
-        question, context,
-        return_tensors='pt',
-        max_length=512,
-        truncation=True
     )
-    input_ids = inputs['input_ids'].tolist()[0]
-    with torch.no_grad():
-        outputs = model(**inputs)
-    answer_start = torch.argmax(outputs.start_logits)
-    answer_end = torch.argmax(outputs.end_logits) + 1
-    answer = tokenizer.convert_tokens_to_string(
-        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
-    )
-    return answer.strip()
-# App interface
-st.title("🤖 BERT Question Answering System")
-st.write("This app uses a locally hosted BERT model to answer questions based on the context you provide.")
-context = st.text_area("📄 Enter the context/passage:", height=200)
-question = st.text_input("❓ Ask a question about the context:")
-if st.button("Get Answer"):
-    if not context or not question:
-        st.warning("Please provide both a context and a question.")
     else:
-        try:
-            answer = get_answer(question, context)
-            if answer:
-                st.success(f"📄 Answer: {answer}")
-            else:
-                st.warning("No answer found in the given context.")
-        except Exception as e:
-            st.error(f"An error occurred: {str(e)}")
-# Add styling
-st.markdown("""
-<style>
-    .stTextInput input, .stTextArea textarea {
-        font-size: 16px !important;
-    }
-    .stButton button {
-        background-color: #4CAF50;
-        color: white;
-        font-weight: bold;
-        padding: 0.5rem 1rem;
-        border-radius: 5px;
-    }
-    .stButton button:hover {
-        background-color: #45a049;
-    }
-</style>
-""", unsafe_allow_html=True)
-# Footer
-st.markdown("---")
-st.markdown("Built with ❤️ using Streamlit and HuggingFace Transformers")

 import streamlit as st
 import torch
+import os
+from transformers import pipeline
+import fitz  # PyMuPDF
+import docx
+from time import time
+# Configure logging
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# ----------------------------
+# SETUP & MODEL LOAD
+# ----------------------------
+st.set_page_config(page_title="Fast QA App", layout="wide")
+st.title("🧠 Instant Question Answering")
+# Set cache directory
+cache_dir = os.path.join(os.getcwd(), "model_cache")
+os.makedirs(cache_dir, exist_ok=True)
+os.environ["TRANSFORMERS_CACHE"] = cache_dir
+# Load model with progress indicator
+@st.cache_resource(show_spinner="Loading AI model...")
+def load_qa_model():
+    logger.info(f"Loading model at {time()}")
+    return pipeline(
+        "question-answering",
+        model="distilbert-base-uncased-distilled-squad",  # Faster alternative
+        device=0 if torch.cuda.is_available() else -1
     )
+qa_pipeline = load_qa_model()
+st.success("Model loaded successfully!")
+# ----------------------------
+# TEXT EXTRACTION FUNCTIONS
+# ----------------------------
+def extract_text_from_pdf(uploaded_file):
+    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
+        return " ".join(page.get_text() for page in doc)
+def extract_text_from_docx(uploaded_file):
+    doc = docx.Document(uploaded_file)
+    return "\n".join(para.text for para in doc.paragraphs if para.text)
+# ----------------------------
+# STREAMLIT UI
+# ----------------------------
+with st.form("qa_form"):
+    st.subheader("📄 Document Input")
+    uploaded_file = st.file_uploader("Upload PDF/DOCX", type=["pdf", "docx"])
+    manual_text = st.text_area("Or paste text here:", height=150)
+    st.subheader("❓ Question Input")
+    question = st.text_input("Enter your question:")
+    submit_btn = st.form_submit_button("Get Answer")
+if submit_btn:
+    context = ""
+    if uploaded_file:
+        file_type = uploaded_file.name.split(".")[-1].lower()
+        if file_type == "pdf":
+            context = extract_text_from_pdf(uploaded_file)
+        elif file_type == "docx":
+            context = extract_text_from_docx(uploaded_file)
     else:
+        context = manual_text
+    if not context:
+        st.warning("Please provide either a document or text input")
+    elif not question:
+        st.warning("Please enter a question")
+    else:
+        with st.spinner("Analyzing content..."):
+            try:
+                result = qa_pipeline(question=question, context=context[:10000])  # Limit context length
+                st.markdown(f"### ✅ Answer: {result['answer']}")
+                st.progress(result["score"])  # Show confidence score
+                st.caption(f"Confidence: {result['score']:.0%}")
+            except Exception as e:
+                st.error(f"Error processing request: {str(e)}")
+# ----------------------------
+# ADVANCED SECTION
+# ----------------------------
+with st.expander("⚙️ Advanced Options"):
+    st.subheader("Model Information")
+    st.code(f"Using: distilbert-base-uncased-distilled-squad")
+    st.caption("Optimized for fast inference on limited resources")

qa_model/config.json DELETED Viewed

@@ -1,25 +0,0 @@
-{
-  "architectures": [
-    "BertForQuestionAnswering"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "classifier_dropout": null,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
-  "position_embedding_type": "absolute",
-  "torch_dtype": "float32",
-  "transformers_version": "4.52.4",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
-}

qa_model/special_tokens_map.json DELETED Viewed

@@ -1,7 +0,0 @@
-{
-  "cls_token": "[CLS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
-}

qa_model/tokenizer.json DELETED Viewed

The diff for this file is too large to render. See raw diff

qa_model/tokenizer_config.json DELETED Viewed

@@ -1,56 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "100": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "101": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "102": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "103": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "clean_up_tokenization_spaces": false,
-  "cls_token": "[CLS]",
-  "do_lower_case": true,
-  "extra_special_tokens": {},
-  "mask_token": "[MASK]",
-  "model_max_length": 512,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
-}

qa_model/vocab.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -1,60 +1,6 @@
-# Core Packages
-absl-py==2.1.0
-aiobotocore==2.12.0
-aiohttp==3.8.6
-altair==4.2.2
-astunparse==1.6.3
-atomicwrites==1.4.0
-attrs==23.1.0
-black==23.9.1
-bokeh==2.4.3
-certifi==2023.7.22
-clarabel==0.10.0
-click==8.1.7
-cvxpy==1.6.5
-cycler==0.12.1
-fastjsonschema==2.18.1
-findspark==2.0.1
-flatbuffers==24.3.25
-fontawesomefree==6.6.0
-gast==0.6.0
-google-pasta==0.2.0
-grpcio==1.68.0
-huggingface-hub==0.26.2
-immutabledict==4.2.1
-keras==3.6.0
-lxml==5.2.1
-matplotlib==3.9.2
-mkl-service==2.4.0
-ml-dtypes==0.4.1
-multitasking==0.0.11
-numpy==1.23.5
-opt-einsum==3.4.0
-optbinning==0.20.1
-optree==0.13.1
-ortools==9.11.4210
-osqp==1.0.4
-pandas==1.5.3
-peewee==3.17.8
-protobuf==5.26.1
-pyarrow==14.0.2
-PyQt5==5.15.10
-PyQtWebEngine==5.15.6
-pywin32==305.1
-pywaffle==1.1.1
-scikit-learn==1.2.2
-scipy==1.10.1
-scs==3.2.7.post2
-setuptools==75.1.0
-sympy==1.13.1
-tensorboard==2.18.0
-tensorflow==2.18.0
-tensorflow-intel==2.18.0
-termcolor==2.5.0
-tokenizers==0.20.3
-torch==2.5.1
-transformers==4.46.3
-wheel==0.44.0
-wordcloud==1.9.4
-xgboost==3.0.1
-yfinance==0.2.50

+streamlit>=1.28.0
+transformers>=4.30.0
+torch>=2.0.0
+python-docx>=0.8.0
+pymupdf>=1.22.0
+tqdm>=4.0.0  # For better download progress