Amritpal Singh committed on
Commit
e0e4765
·
1 Parent(s): 9a6ce9b

Initial commit: Streamlit app

Browse files
.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ TRANSFORMERS_CACHE=/tmp/model_cache
2
+ HF_HOME=/tmp/huggingface
3
+ STREAMLIT_SERVER_MAX_UPLOAD_SIZE=500
4
+ STREAMLIT_SERVER_MAX_MESSAGE_SIZE=500
Dockerfile CHANGED
@@ -1,20 +1,33 @@
 
1
  FROM python:3.9-slim
2
 
 
3
  WORKDIR /app
 
 
 
4
 
5
- # Install git (optional, useful for huggingface model downloads)
6
- RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
 
 
 
7
 
8
- # Copy requirements.txt and install dependencies with no cache to reduce image size
 
 
 
9
  COPY requirements.txt .
10
- RUN pip install --upgrade pip
11
- RUN pip install --no-cache-dir -r requirements.txt
12
 
13
- # Copy all app files
14
- COPY . .
 
15
 
16
- # Expose Streamlit default port
17
  EXPOSE 8501
 
 
18
 
19
- # Run Streamlit app
20
- CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
# Use lightweight Python image
FROM python:3.9-slim

# Set up environment.
# PYTHONUNBUFFERED ensures logs are flushed immediately; the cache vars
# point transformers/huggingface_hub at a writable in-image directory.
WORKDIR /app
ENV PYTHONUNBUFFERED=1 \
    TRANSFORMERS_CACHE=/app/model_cache \
    HF_HOME=/app/model_cache

# Install system dependencies.
# curl is required by the HEALTHCHECK below — it is NOT present in
# python:3.9-slim by default, so without it the healthcheck always fails.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        gcc \
        python3-dev \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Create cache directory with write permissions (Spaces may run as non-root)
RUN mkdir -p /app/model_cache && chmod 777 /app/model_cache

# Install Python packages and pre-download the QA model so the first
# request does not pay the download cost. Copying only requirements.txt
# first keeps this expensive layer cached when app code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
    python -c "from transformers import pipeline; pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')"

# Copy application code last — changes here only invalidate this cheap layer
COPY app.py .

# Expose and run
EXPOSE 8501
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8501/_stcore/health || exit 1

CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: FINALLL
3
  emoji: 🚀
4
  colorFrom: red
5
  colorTo: red
 
1
  ---
2
+ title: Final V1
3
  emoji: 🚀
4
  colorFrom: red
5
  colorTo: red
app.py CHANGED
@@ -1,80 +1,92 @@
1
  import streamlit as st
2
  import torch
3
- from transformers import AutoModelForQuestionAnswering, AutoTokenizer
 
 
 
 
4
 
5
- # Set page config
6
- st.set_page_config(page_title="BERT Question Answering System", layout="centered")
 
 
7
 
8
- # Load model and tokenizer from local directory
9
- @st.cache_resource
10
- def load_model():
11
- model = AutoModelForQuestionAnswering.from_pretrained('qa_model')
12
- tokenizer = AutoTokenizer.from_pretrained('qa_model')
13
- return model, tokenizer
14
 
15
- model, tokenizer = load_model()
 
 
 
16
 
17
- # Function to get answer from model
18
- def get_answer(question, context):
19
- inputs = tokenizer.encode_plus(
20
- question, context,
21
- return_tensors='pt',
22
- max_length=512,
23
- truncation=True
 
24
  )
25
- input_ids = inputs['input_ids'].tolist()[0]
26
 
27
- with torch.no_grad():
28
- outputs = model(**inputs)
29
 
30
- answer_start = torch.argmax(outputs.start_logits)
31
- answer_end = torch.argmax(outputs.end_logits) + 1
 
 
 
 
32
 
33
- answer = tokenizer.convert_tokens_to_string(
34
- tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
35
- )
36
-
37
- return answer.strip()
38
 
39
- # App interface
40
- st.title("🤖 BERT Question Answering System")
41
- st.write("This app uses a locally hosted BERT model to answer questions based on the context you provide.")
 
 
 
 
42
 
43
- context = st.text_area("📄 Enter the context/passage:", height=200)
44
- question = st.text_input(" Ask a question about the context:")
 
45
 
46
- if st.button("Get Answer"):
47
- if not context or not question:
48
- st.warning("Please provide both a context and a question.")
 
 
 
 
 
49
  else:
50
- try:
51
- answer = get_answer(question, context)
52
- if answer:
53
- st.success(f"📄 Answer: {answer}")
54
- else:
55
- st.warning("No answer found in the given context.")
56
- except Exception as e:
57
- st.error(f"An error occurred: {str(e)}")
58
 
59
- # Add styling
60
- st.markdown("""
61
- <style>
62
- .stTextInput input, .stTextArea textarea {
63
- font-size: 16px !important;
64
- }
65
- .stButton button {
66
- background-color: #4CAF50;
67
- color: white;
68
- font-weight: bold;
69
- padding: 0.5rem 1rem;
70
- border-radius: 5px;
71
- }
72
- .stButton button:hover {
73
- background-color: #45a049;
74
- }
75
- </style>
76
- """, unsafe_allow_html=True)
77
 
78
- # Footer
79
- st.markdown("---")
80
- st.markdown("Built with ❤️ using Streamlit and HuggingFace Transformers")
 
 
 
 
 
1
# ----------------------------
# SETUP & MODEL LOAD
# ----------------------------
import os

# Cache env vars must be set BEFORE transformers is imported: transformers
# resolves TRANSFORMERS_CACHE into a module-level constant at import time,
# so assigning it afterwards has no effect. setdefault keeps any value
# already provided by the container environment (see Dockerfile).
cache_dir = os.path.join(os.getcwd(), "model_cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ.setdefault("TRANSFORMERS_CACHE", cache_dir)
os.environ.setdefault("HF_HOME", cache_dir)

import streamlit as st
import torch
from transformers import pipeline
import fitz  # PyMuPDF
import docx
from time import time

# Configure logging
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# st.set_page_config must be the first Streamlit call in the script
st.set_page_config(page_title="Fast QA App", layout="wide")
st.title("🧠 Instant Question Answering")
24
 
25
# Cached model loader — built once per Streamlit session/process.
@st.cache_resource(show_spinner="Loading AI model...")
def load_qa_model():
    """Return a question-answering pipeline backed by DistilBERT-SQuAD.

    Uses GPU device 0 when CUDA is available, otherwise CPU (-1).
    """
    logger.info(f"Loading model at {time()}")
    device_id = 0 if torch.cuda.is_available() else -1
    return pipeline(
        "question-answering",
        model="distilbert-base-uncased-distilled-squad",  # Faster alternative
        device=device_id,
    )
 
34
 
35
# Instantiate the (cached) pipeline at script start so queries are instant.
qa_pipeline = load_qa_model()
st.success("Model loaded successfully!")
37
 
38
# ----------------------------
# TEXT EXTRACTION FUNCTIONS
# ----------------------------
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of an uploaded PDF, space-joined."""
    raw_bytes = uploaded_file.read()
    with fitz.open(stream=raw_bytes, filetype="pdf") as doc:
        pages = [page.get_text() for page in doc]
    return " ".join(pages)
44
 
45
def extract_text_from_docx(uploaded_file):
    """Return the non-empty paragraphs of a .docx file, newline-joined."""
    document = docx.Document(uploaded_file)
    non_empty = [paragraph.text for paragraph in document.paragraphs if paragraph.text]
    return "\n".join(non_empty)
 
 
48
 
49
# ----------------------------
# STREAMLIT UI
# ----------------------------
with st.form("qa_form"):
    st.subheader("📄 Document Input")
    uploaded_file = st.file_uploader("Upload PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste text here:", height=150)

    st.subheader(" Question Input")
    question = st.text_input("Enter your question:")
    submit_btn = st.form_submit_button("Get Answer")

if submit_btn:
    # Prefer an uploaded document; otherwise fall back to pasted text.
    context = ""
    if uploaded_file:
        suffix = uploaded_file.name.split(".")[-1].lower()
        if suffix == "pdf":
            context = extract_text_from_pdf(uploaded_file)
        elif suffix == "docx":
            context = extract_text_from_docx(uploaded_file)
    else:
        context = manual_text

    if not context:
        st.warning("Please provide either a document or text input")
    elif not question:
        st.warning("Please enter a question")
    else:
        with st.spinner("Analyzing content..."):
            try:
                # Cap the context at 10k chars to bound inference latency.
                result = qa_pipeline(question=question, context=context[:10000])
                st.markdown(f"### ✅ Answer: {result['answer']}")
                st.progress(result["score"])  # confidence rendered as a bar
                st.caption(f"Confidence: {result['score']:.0%}")
            except Exception as e:
                st.error(f"Error processing request: {str(e)}")
 
 
 
 
 
85
 
86
# ----------------------------
# ADVANCED SECTION
# ----------------------------
with st.expander("⚙️ Advanced Options"):
    st.subheader("Model Information")
    st.code(f"Using: distilbert-base-uncased-distilled-squad")
    st.caption("Optimized for fast inference on limited resources")
qa_model/config.json DELETED
@@ -1,25 +0,0 @@
1
- {
2
- "architectures": [
3
- "BertForQuestionAnswering"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "classifier_dropout": null,
7
- "gradient_checkpointing": false,
8
- "hidden_act": "gelu",
9
- "hidden_dropout_prob": 0.1,
10
- "hidden_size": 768,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 3072,
13
- "layer_norm_eps": 1e-12,
14
- "max_position_embeddings": 512,
15
- "model_type": "bert",
16
- "num_attention_heads": 12,
17
- "num_hidden_layers": 12,
18
- "pad_token_id": 0,
19
- "position_embedding_type": "absolute",
20
- "torch_dtype": "float32",
21
- "transformers_version": "4.52.4",
22
- "type_vocab_size": 2,
23
- "use_cache": true,
24
- "vocab_size": 30522
25
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
qa_model/special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "cls_token": "[CLS]",
3
- "mask_token": "[MASK]",
4
- "pad_token": "[PAD]",
5
- "sep_token": "[SEP]",
6
- "unk_token": "[UNK]"
7
- }
 
 
 
 
 
 
 
 
qa_model/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
qa_model/tokenizer_config.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "[PAD]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "100": {
12
- "content": "[UNK]",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "101": {
20
- "content": "[CLS]",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "102": {
28
- "content": "[SEP]",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "103": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- }
43
- },
44
- "clean_up_tokenization_spaces": false,
45
- "cls_token": "[CLS]",
46
- "do_lower_case": true,
47
- "extra_special_tokens": {},
48
- "mask_token": "[MASK]",
49
- "model_max_length": 512,
50
- "pad_token": "[PAD]",
51
- "sep_token": "[SEP]",
52
- "strip_accents": null,
53
- "tokenize_chinese_chars": true,
54
- "tokenizer_class": "BertTokenizer",
55
- "unk_token": "[UNK]"
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
qa_model/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,60 +1,6 @@
1
- # Core Packages
2
- absl-py==2.1.0
3
- aiobotocore==2.12.0
4
- aiohttp==3.8.6
5
- altair==4.2.2
6
- astunparse==1.6.3
7
- atomicwrites==1.4.0
8
- attrs==23.1.0
9
- black==23.9.1
10
- bokeh==2.4.3
11
- certifi==2023.7.22
12
- clarabel==0.10.0
13
- click==8.1.7
14
- cvxpy==1.6.5
15
- cycler==0.12.1
16
- fastjsonschema==2.18.1
17
- findspark==2.0.1
18
- flatbuffers==24.3.25
19
- fontawesomefree==6.6.0
20
- gast==0.6.0
21
- google-pasta==0.2.0
22
- grpcio==1.68.0
23
- huggingface-hub==0.26.2
24
- immutabledict==4.2.1
25
- keras==3.6.0
26
- lxml==5.2.1
27
- matplotlib==3.9.2
28
- mkl-service==2.4.0
29
- ml-dtypes==0.4.1
30
- multitasking==0.0.11
31
- numpy==1.23.5
32
- opt-einsum==3.4.0
33
- optbinning==0.20.1
34
- optree==0.13.1
35
- ortools==9.11.4210
36
- osqp==1.0.4
37
- pandas==1.5.3
38
- peewee==3.17.8
39
- protobuf==5.26.1
40
- pyarrow==14.0.2
41
- PyQt5==5.15.10
42
- PyQtWebEngine==5.15.6
43
- pywin32==305.1
44
- pywaffle==1.1.1
45
- scikit-learn==1.2.2
46
- scipy==1.10.1
47
- scs==3.2.7.post2
48
- setuptools==75.1.0
49
- sympy==1.13.1
50
- tensorboard==2.18.0
51
- tensorflow==2.18.0
52
- tensorflow-intel==2.18.0
53
- termcolor==2.5.0
54
- tokenizers==0.20.3
55
- torch==2.5.1
56
- transformers==4.46.3
57
- wheel==0.44.0
58
- wordcloud==1.9.4
59
- xgboost==3.0.1
60
- yfinance==0.2.50
 
1
+ streamlit>=1.28.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ python-docx>=0.8.0
5
+ pymupdf>=1.22.0
6
+ tqdm>=4.0.0 # For better download progress