Ginidu2003 committed on
Commit
620df90
·
verified ·
1 Parent(s): 74452f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -13
app.py CHANGED
@@ -16,7 +16,7 @@ import io
16
  nltk.download('wordnet', quiet=True)
17
  nltk.download('punkt', quiet=True)
18
  nltk.download('punkt_tab', quiet=True)
19
-
20
 
21
  lemmatizer = WordNetLemmatizer()
22
 
@@ -80,13 +80,9 @@ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
80
  qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
81
  qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
82
 
83
- from wordcloud import WordCloud
84
- import matplotlib.pyplot as plt
85
- import io
86
-
87
  def answer_question(news_content, question):
88
  if not news_content.strip() or not question.strip():
89
- return "Please enter both news content and a question."
90
 
91
  try:
92
  inputs = qa_tokenizer(question, news_content, return_tensors="pt", truncation=True, max_length=512)
@@ -97,21 +93,18 @@ def answer_question(news_content, question):
97
  start_idx = torch.argmax(outputs.start_logits)
98
  end_idx = torch.argmax(outputs.end_logits) + 1
99
 
100
- # Clean answer - remove question repetition and special tokens
101
  answer = qa_tokenizer.decode(inputs.input_ids[0][start_idx:end_idx],
102
  skip_special_tokens=True,
103
  clean_up_tokenization_spaces=True)
104
 
105
  confidence = torch.max(torch.softmax(outputs.start_logits, dim=1)).item()
106
-
107
-
108
 
109
  # ====== Useful Statistics ======
110
  words = nltk.word_tokenize(news_content)
111
  sentences = nltk.sent_tokenize(news_content)
112
  word_count = len(words)
113
  sentence_count = len(sentences)
114
- reading_time = round(word_count / 200, 2) # ~200 words/minute
115
 
116
  stats = f"Word count: {word_count}\nSentences: {sentence_count}\nEstimated reading time: {reading_time} min"
117
 
@@ -122,10 +115,15 @@ def answer_question(news_content, question):
122
  ax.axis("off")
123
  buf = io.BytesIO()
124
  plt.savefig(buf, format="png")
 
125
  buf.seek(0)
126
- return f"**Answer:** {answer.strip()}\n\n**Confidence:** {confidence:.2%}\n\n{stats}", buf
 
 
 
 
127
  except Exception as e:
128
- return f"Error: {str(e)}"
129
 
130
 
131
  # ====================== GRADIO INTERFACE ======================
@@ -158,13 +156,14 @@ with gr.Blocks(title="Daily Mirror News Classifier") as demo:
158
  news_input = gr.Textbox(lines=12, label="Paste News Content", placeholder="Paste the full news article here...")
159
  question_input = gr.Textbox(label="Your Question", placeholder="e.g. What is the main topic?")
160
  qa_btn = gr.Button("🔍 Get Answer", variant="primary")
 
161
  qa_output = gr.Textbox(label="Answer + Stats", lines=8)
162
  wc_output = gr.Image(label="Word Cloud")
163
 
164
  qa_btn.click(
165
  fn=answer_question,
166
  inputs=[news_input, question_input],
167
- outputs=[qa_output,wc_output]
168
  )
169
 
170
  gr.Markdown("Built for Text Analytics Assignment - Section 02")
 
16
  nltk.download('wordnet', quiet=True)
17
  nltk.download('punkt', quiet=True)
18
  nltk.download('punkt_tab', quiet=True)
19
+ nltk.download('averaged_perceptron_tagger', quiet=True)
20
 
21
  lemmatizer = WordNetLemmatizer()
22
 
 
80
  qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
81
  qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
82
 
 
 
 
 
83
  def answer_question(news_content, question):
84
  if not news_content.strip() or not question.strip():
85
+ return "Please enter both news content and a question.", None
86
 
87
  try:
88
  inputs = qa_tokenizer(question, news_content, return_tensors="pt", truncation=True, max_length=512)
 
93
  start_idx = torch.argmax(outputs.start_logits)
94
  end_idx = torch.argmax(outputs.end_logits) + 1
95
 
 
96
  answer = qa_tokenizer.decode(inputs.input_ids[0][start_idx:end_idx],
97
  skip_special_tokens=True,
98
  clean_up_tokenization_spaces=True)
99
 
100
  confidence = torch.max(torch.softmax(outputs.start_logits, dim=1)).item()
 
 
101
 
102
  # ====== Useful Statistics ======
103
  words = nltk.word_tokenize(news_content)
104
  sentences = nltk.sent_tokenize(news_content)
105
  word_count = len(words)
106
  sentence_count = len(sentences)
107
+ reading_time = round(word_count / 200, 2)
108
 
109
  stats = f"Word count: {word_count}\nSentences: {sentence_count}\nEstimated reading time: {reading_time} min"
110
 
 
115
  ax.axis("off")
116
  buf = io.BytesIO()
117
  plt.savefig(buf, format="png")
118
+ plt.close(fig) # Important
119
  buf.seek(0)
120
+
121
+ full_output = f"**Answer:** {answer.strip()}\n\n**Confidence:** {confidence:.2%}\n\n{stats}"
122
+
123
+ return full_output, buf
124
+
125
  except Exception as e:
126
+ return f"Error: {str(e)}", None
127
 
128
 
129
  # ====================== GRADIO INTERFACE ======================
 
156
  news_input = gr.Textbox(lines=12, label="Paste News Content", placeholder="Paste the full news article here...")
157
  question_input = gr.Textbox(label="Your Question", placeholder="e.g. What is the main topic?")
158
  qa_btn = gr.Button("🔍 Get Answer", variant="primary")
159
+
160
  qa_output = gr.Textbox(label="Answer + Stats", lines=8)
161
  wc_output = gr.Image(label="Word Cloud")
162
 
163
  qa_btn.click(
164
  fn=answer_question,
165
  inputs=[news_input, question_input],
166
+ outputs=[qa_output, wc_output]
167
  )
168
 
169
  gr.Markdown("Built for Text Analytics Assignment - Section 02")