amritn8 committed on
Commit
d80c9f5
·
verified ·
1 Parent(s): 56735e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -70
app.py CHANGED
@@ -1,92 +1,116 @@
1
  import streamlit as st
2
  import torch
3
- import os
4
- from transformers import pipeline
5
- import fitz # PyMuPDF
6
  import docx
 
7
  from time import time
8
 
9
- # Configure logging
10
- import logging
11
- logging.basicConfig(level=logging.INFO)
12
- logger = logging.getLogger(__name__)
13
-
14
- # ----------------------------
15
- # SETUP & MODEL LOAD
16
- # ----------------------------
17
- st.set_page_config(page_title="Fast QA App", layout="wide")
18
- st.title("🧠 Instant Question Answering")
19
-
20
- # Set cache directory
21
  cache_dir = os.path.join(os.getcwd(), "model_cache")
22
  os.makedirs(cache_dir, exist_ok=True)
23
  os.environ["TRANSFORMERS_CACHE"] = cache_dir
24
 
25
- # Load model with progress indicator
26
- @st.cache_resource(show_spinner="Loading AI model...")
27
- def load_qa_model():
28
- logger.info(f"Loading model at {time()}")
29
- return pipeline(
30
- "question-answering",
31
- model="distilbert-base-uncased-distilled-squad", # Faster alternative
32
- device=0 if torch.cuda.is_available() else -1
33
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- qa_pipeline = load_qa_model()
36
- st.success("Model loaded successfully!")
37
 
38
  # ----------------------------
39
- # TEXT EXTRACTION FUNCTIONS
40
  # ----------------------------
41
- def extract_text_from_pdf(uploaded_file):
42
- with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
43
- return " ".join(page.get_text() for page in doc)
 
 
 
 
 
 
44
 
45
- def extract_text_from_docx(uploaded_file):
46
- doc = docx.Document(uploaded_file)
47
- return "\n".join(para.text for para in doc.paragraphs if para.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  # ----------------------------
50
  # STREAMLIT UI
51
  # ----------------------------
52
- with st.form("qa_form"):
53
- st.subheader("πŸ“„ Document Input")
54
- uploaded_file = st.file_uploader("Upload PDF/DOCX", type=["pdf", "docx"])
55
- manual_text = st.text_area("Or paste text here:", height=150)
56
 
57
- st.subheader("❓ Question Input")
58
- question = st.text_input("Enter your question:")
59
- submit_btn = st.form_submit_button("Get Answer")
60
-
61
- if submit_btn:
62
- context = ""
63
- if uploaded_file:
64
- file_type = uploaded_file.name.split(".")[-1].lower()
65
- if file_type == "pdf":
66
- context = extract_text_from_pdf(uploaded_file)
67
- elif file_type == "docx":
68
- context = extract_text_from_docx(uploaded_file)
69
- else:
70
- context = manual_text
71
-
72
- if not context:
73
- st.warning("Please provide either a document or text input")
74
- elif not question:
75
- st.warning("Please enter a question")
76
- else:
77
- with st.spinner("Analyzing content..."):
78
- try:
79
- result = qa_pipeline(question=question, context=context[:10000]) # Limit context length
80
- st.markdown(f"### βœ… Answer: {result['answer']}")
81
- st.progress(result["score"]) # Show confidence score
82
- st.caption(f"Confidence: {result['score']:.0%}")
83
- except Exception as e:
84
- st.error(f"Error processing request: {str(e)}")
85
 
86
  # ----------------------------
87
- # ADVANCED SECTION
88
  # ----------------------------
89
- with st.expander("βš™οΈ Advanced Options"):
90
- st.subheader("Model Information")
91
- st.code(f"Using: distilbert-base-uncased-distilled-squad")
92
- st.caption("Optimized for fast inference on limited resources")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import torch
3
+ from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
4
+ from PyPDF2 import PdfReader
 
5
  import docx
6
+ import os
7
  from time import time
8
 
9
+ # Configure environment
 
 
 
 
 
 
 
 
 
 
 
10
# Download and cache model weights under ./model_cache so app restarts
# reuse them instead of re-downloading.
cache_dir = os.path.join(os.getcwd(), "model_cache")
os.makedirs(cache_dir, exist_ok=True)
# transformers consults TRANSFORMERS_CACHE for its download/cache location.
# NOTE(review): this env var is deprecated in recent transformers releases in
# favor of HF_HOME — confirm the installed version still honors it.
os.environ["TRANSFORMERS_CACHE"] = cache_dir
13
 
14
# ----------------------------
# MODEL LOADING
# ----------------------------
@st.cache_resource(show_spinner=False)  # cached per server process; custom spinners below
def load_models():
    """Load all models with progress tracking.

    Returns:
        dict: {'qa': question-answering pipeline (deepset/roberta-base-squad2),
               'summarizer': summarization pipeline (facebook/bart-large-cnn)}.

    Both pipelines run on GPU (device 0) when CUDA is available, else CPU (-1).
    First call downloads weights into TRANSFORMERS_CACHE; later reruns hit the
    st.cache_resource cache and return the same objects.
    """
    models = {}

    with st.spinner("🚀 Loading QA Model..."):
        models['qa'] = pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            device=0 if torch.cuda.is_available() else -1
        )

    with st.spinner("📝 Loading Summarization Model..."):
        models['summarizer'] = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            tokenizer="facebook/bart-large-cnn",
            device=0 if torch.cuda.is_available() else -1
        )

    return models

# Loaded at import time; blocks the first page render until models are ready.
models = load_models()
 
40
 
41
# ----------------------------
# DOCUMENT PROCESSING
# ----------------------------
def extract_text(file):
    """Universal text extractor for PDF/DOCX uploads.

    Args:
        file: an uploaded file object exposing a MIME ``type`` attribute and a
              file-like interface for the parsers below (Streamlit UploadedFile).

    Returns:
        str: the extracted plain text, or "" for unsupported MIME types.
    """
    if file.type == "application/pdf":
        reader = PdfReader(file)
        # extract_text() can return None for pages without a text layer
        # (e.g. scanned images); coerce to "" so join() never sees None.
        return " ".join(page.extract_text() or "" for page in reader.pages)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = docx.Document(file)
        return "\n".join(para.text for para in doc.paragraphs if para.text)
    # Unsupported type: empty string lets callers fall back gracefully.
    return ""
53
 
54
# ----------------------------
# SUMMARIZATION FUNCTION
# ----------------------------
def summarize(text, max_length=150, min_length=30):
    """Advanced summarization with chunking for long documents.

    Args:
        text: raw document text to summarize.
        max_length: maximum length of each generated (chunk) summary.
        min_length: minimum length of each generated (chunk) summary.

    Returns:
        str: the summary; "" on empty input or on a model error (the error is
        shown to the user via st.error rather than raised).
    """
    # Guard empty/whitespace input up front: previously this fell through to
    # the model and surfaced a confusing pipeline error to the user.
    if not text or not text.strip():
        return ""
    try:
        words = text.split()
        if len(words) > 1000:  # chunk large documents
            # Chunk on word boundaries (~ the old 3000-char chunks) so a word
            # is never cut in half mid-chunk.
            chunk_words = 500
            chunks = [
                " ".join(words[i:i + chunk_words])
                for i in range(0, len(words), chunk_words)
            ]
            summaries = []
            for chunk in chunks:
                result = models['summarizer'](
                    chunk,
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False
                )
                summaries.append(result[0]['summary_text'])
            return " ".join(summaries)
        return models['summarizer'](
            text, max_length=max_length, min_length=min_length
        )[0]['summary_text']
    except Exception as e:
        # Best-effort: report in the UI and degrade to an empty summary.
        st.error(f"Summarization error: {str(e)}")
        return ""
76
 
77
# ----------------------------
# STREAMLIT UI
# ----------------------------
st.title("📚 Document Intelligence Suite")

# Main Document Input
with st.expander("📄 Upload Document", expanded=True):
    uploaded_file = st.file_uploader("Choose PDF/DOCX", type=["pdf", "docx"])
    manual_text = st.text_area("Or paste raw text here:", height=150)
    # An uploaded file takes precedence over pasted text when both are given.
    context = extract_text(uploaded_file) if uploaded_file else manual_text

# ----------------------------
# ADVANCED FEATURES
# ----------------------------
with st.expander("🔧 Advanced Tools", expanded=False):
    # Customization controls are declared BEFORE the button so the chosen
    # length actually applies to the generated summary. (Previously the
    # slider was rendered after the button and its value was never used.)
    st.header("⚙️ Customization")
    max_len = st.slider("Summary Length", 50, 300, 150)
    show_chunks = st.checkbox("Show processing chunks", False)  # TODO: not wired to anything yet

    st.header("📝 Document Summarization")

    if st.button("Generate Summary"):
        if not context:
            st.warning("Please provide content first")
        else:
            with st.spinner("Analyzing document..."):
                start_time = time()
                # Wire the slider value through to the summarizer.
                summary = summarize(context, max_length=max_len)
                st.success(f"Generated in {time()-start_time:.1f}s")
                st.markdown(f"**Summary:**\n\n{summary}")

    # Question Answering Section
    if context:
        st.header("❓ Question Answering")
        question = st.text_input("Ask about the document:")
        if question:
            with st.spinner("Searching for answers..."):
                # Hard cap the context to keep QA latency bounded.
                result = models['qa'](question=question, context=context[:100000])  # 100k char limit
                st.markdown(f"**Answer:** {result['answer']}")
                st.caption(f"Confidence: {result['score']:.0%}")