Prageeth-1 committed on
Commit
9bc69b8
·
verified ·
1 Parent(s): 3d13f8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -42
app.py CHANGED
@@ -3,51 +3,17 @@ import pandas as pd
3
  import numpy as np
4
  import re
5
  import nltk
6
- import os
7
  from nltk.corpus import stopwords
8
  from nltk.stem import WordNetLemmatizer
9
- from nltk.tokenize import word_tokenize
10
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
11
  from wordcloud import WordCloud
12
  import matplotlib.pyplot as plt
13
  import io
14
 
15
- st.set_page_config(
16
- page_title="News Analysis Dashboard",
17
- page_icon="📰",
18
- layout="wide",
19
- initial_sidebar_state="expanded"
20
- )
21
-
22
- st.cache_resource
23
- def setup_nltk():
24
- # Set NLTK data path
25
- nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')
26
- os.makedirs(nltk_data_path, exist_ok=True)
27
- nltk.data.path.append(nltk_data_path)
28
-
29
- # Download required resources with retries
30
- required = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']
31
- for resource in required:
32
- max_retries = 3
33
- for attempt in range(max_retries):
34
- try:
35
- nltk.data.find(f'tokenizers/punkt/PY3/english.pickle')
36
- break
37
- except LookupError:
38
- try:
39
- nltk.download(resource, download_dir=nltk_data_path)
40
- # Special handling for punkt
41
- if resource == 'punkt':
42
- nltk.download('punkt_tab', download_dir=nltk_data_path)
43
- except Exception as e:
44
- if attempt == max_retries - 1:
45
- st.error(f"Failed to download NLTK resource {resource} after {max_retries} attempts")
46
- raise
47
- continue
48
-
49
- # Run initialization before anything else
50
- setup_nltk()
51
 
52
  # Initialize lemmatizer
53
  lemmatizer = WordNetLemmatizer()
@@ -91,7 +57,13 @@ def generate_wordcloud(text, title=None):
91
  plt.title(title, fontsize=20)
92
  st.pyplot(plt)
93
 
94
-
 
 
 
 
 
 
95
 
96
  # Custom CSS
97
  st.markdown("""
@@ -143,11 +115,9 @@ with tab1:
143
 
144
  # Load the fine-tuned news classifier
145
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
146
-
147
- df['preprocessed_text'] = df['content'].apply(preprocess_text)
148
 
149
  # Classify each article and store the predictions
150
- df["predicted_category"] = df["preprocessed_text"].apply(lambda text: classifier(text)[0]["label"])
151
 
152
  # Preprocess and classify
153
 
 
3
  import numpy as np
4
  import re
5
  import nltk
 
6
  from nltk.corpus import stopwords
7
  from nltk.stem import WordNetLemmatizer
 
8
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
9
  from wordcloud import WordCloud
10
  import matplotlib.pyplot as plt
11
  import io
12
 
13
+ # Download NLTK resources
14
+ nltk.download('punkt')
15
+ nltk.download('stopwords')
16
+ nltk.download('wordnet')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # Initialize lemmatizer
19
  lemmatizer = WordNetLemmatizer()
 
57
  plt.title(title, fontsize=20)
58
  st.pyplot(plt)
59
 
60
+ # Set page config
61
+ st.set_page_config(
62
+ page_title="News Analysis Dashboard",
63
+ page_icon="📰",
64
+ layout="wide",
65
+ initial_sidebar_state="expanded"
66
+ )
67
 
68
  # Custom CSS
69
  st.markdown("""
 
115
 
116
  # Load the fine-tuned news classifier
117
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
 
 
118
 
119
  # Classify each article and store the predictions
120
+ df["predicted_category"] = df["content"].apply(lambda text: classifier(text)[0]["label"])
121
 
122
  # Preprocess and classify
123