Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -183,37 +183,27 @@ st.markdown(
|
|
| 183 |
|
| 184 |
st.markdown("<div class='prompt-box'>Paste the article content below to analyze its category with PressGuard🛡️</div>", unsafe_allow_html=True)
|
| 185 |
|
| 186 |
-
#
|
| 187 |
-
nltk_data_path =
|
| 188 |
if not os.path.exists(nltk_data_path):
|
| 189 |
os.makedirs(nltk_data_path)
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
except LookupError:
|
| 199 |
-
nltk.download('stopwords', download_dir=nltk_data_path)
|
| 200 |
-
|
| 201 |
-
try:
|
| 202 |
-
nltk.data.find('corpora/wordnet')
|
| 203 |
-
except LookupError:
|
| 204 |
-
nltk.download('wordnet', download_dir=nltk_data_path)
|
| 205 |
|
| 206 |
# Initialize stopwords and lemmatizer
|
| 207 |
stop_words = set(stopwords.words('english')).union({"pm"})
|
| 208 |
lemmatizer = WordNetLemmatizer()
|
| 209 |
|
| 210 |
-
import nltk
|
| 211 |
-
nltk.download('punkt')
|
| 212 |
-
nltk.download('stopwords')
|
| 213 |
-
nltk.download('wordnet')
|
| 214 |
-
|
| 215 |
|
| 216 |
-
# Preprocessing Function
|
| 217 |
def pre_process(x):
|
| 218 |
x = x.lower()
|
| 219 |
x = re.sub("<.*?>", "", x)
|
|
@@ -224,32 +214,55 @@ def pre_process(x):
|
|
| 224 |
x = emoji.demojize(x)
|
| 225 |
x = re.sub(":.*?:", "", x)
|
| 226 |
x = re.sub("[^a-zA-Z0-9\\s_]", "", x)
|
|
|
|
| 227 |
words = word_tokenize(x)
|
| 228 |
words = [word for word in words if word not in stop_words]
|
| 229 |
x = " ".join([lemmatizer.lemmatize(word) for word in words])
|
| 230 |
return x
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
@st.cache_resource
|
| 235 |
def load_model():
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
with open("label_encoder_m5.pkl", 'rb') as file:
|
| 239 |
label_encoder = pickle.load(file)
|
|
|
|
| 240 |
return model, vectorizer, label_encoder
|
| 241 |
|
|
|
|
|
|
|
| 242 |
model, vectorizer, label_encoder = load_model()
|
| 243 |
|
| 244 |
-
|
|
|
|
| 245 |
def predict_category(text):
|
| 246 |
processed_text = [pre_process(text)]
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
prediction = model.predict(text_vectorized)
|
| 249 |
category_idx = np.argmax(prediction, axis=1)[0]
|
|
|
|
|
|
|
| 250 |
return label_encoder.inverse_transform([category_idx])[0]
|
| 251 |
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
| 253 |
input_text = st.text_area("Enter News Article:", height=200)
|
| 254 |
|
| 255 |
if st.button("Analyze", key="analyze-btn", help="Click to classify the news article"):
|
|
@@ -257,4 +270,4 @@ if st.button("Analyze", key="analyze-btn", help="Click to classify the news arti
|
|
| 257 |
category = predict_category(input_text)
|
| 258 |
st.markdown(f"<div class='result-box'>Predicted Category: {category}</div>", unsafe_allow_html=True)
|
| 259 |
else:
|
| 260 |
-
st.warning("Please enter some text to analyze.")
|
|
|
|
| 183 |
|
| 184 |
# Prompt shown above the input box (raw HTML styled by the app's CSS).
st.markdown("<div class='prompt-box'>Paste the article content below to analyze its category with PressGuard🛡️</div>", unsafe_allow_html=True)
|
| 185 |
|
| 186 |
+
# Ensure NLTK resources are downloaded in the correct directory
nltk_data_path = '/root/nltk_data'  # Use the correct path in Hugging Face Spaces
os.makedirs(nltk_data_path, exist_ok=True)

# Register the custom directory BEFORE probing for resources: otherwise
# nltk.data.find() never looks in nltk_data_path, every LookupError fires,
# and the corpora are re-downloaded on every cold start.
nltk.data.path.append(nltk_data_path)

# Download NLTK resources only if not already present
for resource in ['punkt', 'stopwords', 'wordnet']:
    # 'punkt' lives under tokenizers/; the other resources are corpora.
    lookup = f'tokenizers/{resource}' if resource == 'punkt' else f'corpora/{resource}'
    try:
        nltk.data.find(lookup)
    except LookupError:
        nltk.download(resource, download_dir=nltk_data_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
# Build the stop-word set (standard English list plus the domain token "pm")
# and the WordNet lemmatizer; both are used by pre_process().
stop_words = set(stopwords.words('english')) | {"pm"}
lemmatizer = WordNetLemmatizer()
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
+
# ✅ Preprocessing Function
|
| 207 |
def pre_process(x):
|
| 208 |
x = x.lower()
|
| 209 |
x = re.sub("<.*?>", "", x)
|
|
|
|
| 214 |
x = emoji.demojize(x)
|
| 215 |
x = re.sub(":.*?:", "", x)
|
| 216 |
x = re.sub("[^a-zA-Z0-9\\s_]", "", x)
|
| 217 |
+
|
| 218 |
words = word_tokenize(x)
|
| 219 |
words = [word for word in words if word not in stop_words]
|
| 220 |
x = " ".join([lemmatizer.lemmatize(word) for word in words])
|
| 221 |
return x
|
| 222 |
|
| 223 |
+
|
| 224 |
+
# ✅ Load Model and Vectorizer
@st.cache_resource
def load_model():
    """Load and cache the trained artifacts for the classifier.

    Returns:
        tuple: (keras model, fitted text vectorizer, fitted label encoder),
        in that order. Cached by Streamlit so the files are read only once.
    """
    def unpickle(path):
        # Local helper: read one pickled artifact from disk.
        with open(path, 'rb') as file:
            return pickle.load(file)

    # Keras classifier
    model = tf.keras.models.load_model("model_m3_new.keras")
    # Vectorizer and label encoder pickled at training time
    vectorizer = unpickle("vec_text_m3_new.pkl")
    label_encoder = unpickle("label_encoder_m5.pkl")
    return model, vectorizer, label_encoder
|
| 239 |
|
| 240 |
+
|
| 241 |
+
# Load models once at import time (cached across reruns by st.cache_resource)
model, vectorizer, label_encoder = load_model()
|
| 243 |
|
| 244 |
+
|
| 245 |
+
# ✅ Prediction Function
def predict_category(text):
    """Classify a raw news article and return its category name.

    Runs the article through pre_process(), the fitted vectorizer, and the
    cached keras model, then maps the winning class index back to its label.
    """
    cleaned = [pre_process(text)]

    # Turn the cleaned text into a dense feature matrix.
    features = vectorizer.transform(cleaned).toarray()

    # Pad/truncate to the fixed input length of the network.
    # NOTE(review): applying pad_sequences to vectorizer output assumes the
    # model was trained on identically padded vectors — confirm against the
    # training pipeline.
    features = pad_sequences(features, padding='pre', maxlen=128)

    # Pick the highest-scoring class and decode it to a category name.
    scores = model.predict(features)
    best_idx = np.argmax(scores, axis=1)[0]
    return label_encoder.inverse_transform([best_idx])[0]
|
| 261 |
|
| 262 |
+
|
| 263 |
+
# ✅ Streamlit UI
st.title("AI-Powered News Categorization")

# Free-form input area for the article text to classify.
input_text = st.text_area("Enter News Article:", height=200)
|
| 267 |
|
| 268 |
if st.button("Analyze", key="analyze-btn", help="Click to classify the news article"):
|
|
|
|
| 270 |
category = predict_category(input_text)
|
| 271 |
st.markdown(f"<div class='result-box'>Predicted Category: {category}</div>", unsafe_allow_html=True)
|
| 272 |
else:
|
| 273 |
+
st.warning("Please enter some text to analyze.")
|