File size: 6,507 Bytes
e964457
 
 
 
 
 
 
 
 
e066c20
 
e964457
023e889
e964457
 
ffb0cc4
e964457
 
 
090583a
e964457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f72ec8
 
e964457
5f72ec8
 
e964457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffb0cc4
e964457
4834dfe
e964457
 
 
 
 
 
 
 
 
 
 
 
54d53f4
4834dfe
6eac30f
ffb0cc4
54d53f4
4834dfe
6eac30f
54d53f4
4834dfe
 
6eac30f
e964457
3a5f15a
 
 
 
ffb0cc4
ed9bb4a
d21afd0
6cef51d
 
 
 
 
e066c20
 
 
 
 
 
 
 
 
 
 
 
e964457
023e889
e964457
ffb0cc4
e964457
 
 
 
 
 
 
023e889
 
e066c20
 
6cef51d
e066c20
e964457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e066c20
6cef51d
 
 
e066c20
 
 
e964457
 
 
 
 
 
 
 
 
ed9bb4a
023e889
e964457
023e889
e066c20
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import streamlit as st
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Download required NLTK data
@st.cache_resource(show_spinner=False)
def _ensure_nltk_data():
    """Fetch the NLTK corpora used by the text cleaning step.

    Streamlit re-executes this script on every widget interaction; caching
    the call means the download check is performed once per server process
    instead of on every rerun.  ``show_spinner=False`` keeps the cache from
    drawing anything before ``st.set_page_config`` runs further down.
    """
    for corpus in ("stopwords", "wordnet", "omw-1.4"):
        # quiet=True suppresses the per-package progress output in the logs.
        nltk.download(corpus, quiet=True)
    return True


_ensure_nltk_data()

# Load Models
@st.cache_resource(show_spinner=False)
def _load_pipeline(task, model_name):
    """Build a Hugging Face pipeline once and reuse it across script reruns.

    Streamlit re-runs the whole script on every interaction; without caching
    both models would be re-instantiated each time.  ``show_spinner=False``
    keeps the cache from drawing anything before ``st.set_page_config`` runs
    further down.

    Args:
        task: Pipeline task name passed to ``transformers.pipeline``.
        model_name: Model identifier passed as ``model=``.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    return pipeline(task, model=model_name)


news_classifier = _load_pipeline("text-classification", "Oneli/News_Classification")
qa_pipeline = _load_pipeline("question-answering", "distilbert-base-cased-distilled-squad")

# Label Mapping
# Translates the raw label ids looked up from classifier results into the
# category names shown to the user.
label_mapping = dict(
    LABEL_0="Business",
    LABEL_1="Opinion",
    LABEL_2="Political Gossip",
    LABEL_3="Sports",
    LABEL_4="World News",
)

# Store classified article for QA
# "context" holds the most recent cleaned single article, "bulk_context" the
# joined text of the most recent CSV, and "num_articles" that CSV's row count.
context_storage = dict(context="", bulk_context="", num_articles=0)

# Text Cleaning Functions
# Everything that is not an ASCII letter, digit or whitespace is stripped.
# This single pattern also covers every character in string.punctuation, so
# no separate punctuation pass is needed.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")
_STOP_WORDS = None  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return the English stopword list as a frozenset, loading it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def clean_text(text: str) -> str:
    """Normalise raw article text before it is passed to the classifier.

    Steps: lowercase, drop every character that is not an ASCII letter, digit
    or whitespace, split on whitespace, remove English stopwords, lemmatize
    each remaining token and join the tokens with single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string when no tokens survive.
    """
    text = _NON_ALNUM_RE.sub("", text.lower())
    # Set membership instead of re-fetching the stopword list for every token.
    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()  # Tokenization without Punkt
        if word not in stop_words
    )

# Define the functions
def classify_text(text):
    """Classify a single news article.

    The text is cleaned with ``clean_text`` and classified, and the cleaned
    text is stored in ``context_storage["context"]`` as a side effect.

    Args:
        text: Raw article text.

    Returns:
        A ``(category, confidence_message)`` tuple. ``category`` is the
        readable label from ``label_mapping`` (``"Unknown"`` for an unmapped
        label) and ``confidence_message`` has the form ``"Confidence: <n>%"``.
    """
    cleaned_text = clean_text(text)
    # truncation=True lets the tokenizer cut articles that exceed the model's
    # maximum sequence length instead of the pipeline raising on them.
    result = news_classifier(cleaned_text, truncation=True)[0]
    category = label_mapping.get(result["label"], "Unknown")
    confidence = round(result["score"] * 100, 2)

    # Store context for QA
    context_storage["context"] = cleaned_text

    return category, f"Confidence: {confidence}%"

def classify_csv(file):
    """Classify every row of a CSV file and write the result to ``output.csv``.

    The first column is assumed to hold the article text. It is replaced by
    its cleaned form, and two columns are added: ``"Decoded Prediction"``
    (readable category) and ``"Confidence"`` (percentage rounded to two
    decimals). As a side effect ``context_storage["bulk_context"]`` and
    ``context_storage["num_articles"]`` are updated.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        ``(df, "output.csv")`` on success, or ``(None, "Error: <message>")``
        when anything goes wrong; callers rely on this instead of an exception.
    """
    try:
        # A file-like object that was already read once is positioned at its
        # end; rewind it so that repeated calls do not see an empty file.
        if hasattr(file, "seek"):
            file.seek(0)
        df = pd.read_csv(file, encoding="utf-8")
        text_column = df.columns[0]  # Assume first column is the text column

        df[text_column] = df[text_column].astype(str).apply(clean_text)  # Clean text column
        texts = df[text_column].tolist()

        # Run the model once per row and reuse the result for both columns
        # (previously every row was classified twice).  truncation=True keeps
        # over-long articles from raising inside the pipeline.
        results = news_classifier(texts, truncation=True) if texts else []
        df["Decoded Prediction"] = [
            label_mapping.get(result["label"], "Unknown") for result in results
        ]
        df["Confidence"] = [round(result["score"] * 100, 2) for result in results]

        # Store all text as a single context for QA
        context_storage["bulk_context"] = " ".join(texts)
        context_storage["num_articles"] = len(df)

        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        return df, output_file
    except Exception as e:
        # Boundary handler: the UI expects an error message, not an exception.
        return None, f"Error: {str(e)}"

def chatbot_response(history, user_input, text_input=None, file_input=None):
    """Answer a question from the selected context and record the exchange.

    Args:
        history: List of ``[question, answer]`` pairs; it is appended to in
            place and also returned.
        user_input: The user's question. It is lowercased before use.
        text_input: Optional article text to use as QA context.
        file_input: Optional CSV upload whose classified text is used as QA
            context.

    Returns:
        ``(history, answer)``. When the question is blank or no context is
        available, ``answer`` is an explanatory message rather than a model
        answer (previously the no-context case raised ``UnboundLocalError``).
    """
    # NOTE(review): the QA model name says "cased" but the question is
    # lowercased here; kept as-is to preserve existing behaviour - confirm.
    user_input = user_input.lower()

    if not user_input.strip():
        # Do not send an empty question to the QA pipeline.
        answer = "Please enter a question."
        history.append([user_input, answer])
        return history, answer

    context = ""

    if text_input:
        context += text_input

    if file_input:
        # Reuse the bulk context when the file was already classified in this
        # run; only classify here when nothing has been stored yet.
        if not context_storage["bulk_context"]:
            classify_csv(file_input)
        context += context_storage["bulk_context"]

    if context:
        with st.spinner("Finding answer..."):
            result = qa_pipeline(question=user_input, context=context)
            answer = result["answer"]
    else:
        answer = "No context available. Enter an article or upload a CSV file first."

    history.append([user_input, answer])
    return history, answer

# Function to generate word cloud from the text column (from output CSV)
def generate_word_cloud_from_output(df):
    """Build a word cloud from the text column of ``df``.

    The ``"content"`` column is used when present. Otherwise the first column
    is used, which is the column ``classify_csv`` treats as the article text.
    Without this fallback any CSV lacking a ``"content"`` header raised
    ``KeyError``.

    Args:
        df: DataFrame holding the article text.

    Returns:
        The generated ``WordCloud`` object (800x400, white background).

    Note:
        NOTE(review): ``WordCloud.generate`` is expected to raise
        ``ValueError`` when the text contains no usable words - callers
        should be prepared for that.
    """
    text_column = "content" if "content" in df.columns else df.columns[0]
    content_text = " ".join(df[text_column].dropna().astype(str).tolist())
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(content_text)
    return wordcloud

# Function to generate bar graph for decoded predictions
def generate_bar_graph(df):
    """Render a bar chart of how often each predicted category occurs.

    Args:
        df: DataFrame with a ``"Decoded Prediction"`` column.

    The chart is drawn into the Streamlit page; nothing is returned.
    """
    prediction_counts = df["Decoded Prediction"].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    prediction_counts.plot(kind='bar', ax=ax, color='skyblue')
    ax.set_title('Frequency of Decoded Predictions', fontsize=16)
    ax.set_xlabel('Category', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; the script
    # is re-run on each interaction, so release the figure once it is rendered.
    plt.close(fig)

# Streamlit App Layout
# The emoji literals below replace mis-encoded byte sequences.
st.set_page_config(page_title="News Classifier", page_icon="📰")
try:
    cover_image = Image.open("cover.png")
except FileNotFoundError:
    # The banner is decorative; a missing file should not take the app down.
    cover_image = None
if cover_image is not None:
    st.image(cover_image, caption="News Classifier 📒", use_container_width=True)

# Section for Single Article Classification
st.subheader("📰 Single Article Classification")
text_input = st.text_area("Enter News Text", placeholder="Type or paste news content here...")
if st.button("🔍 Classify"):
    # Treat whitespace-only input like empty input.
    if text_input and text_input.strip():
        category, confidence = classify_text(text_input)
        st.write(f"Predicted Category: {category}")
        # classify_text already returns a labelled string ("Confidence: <n>%"),
        # so it is written as-is rather than prefixed with a second label.
        st.write(confidence)

        # Generate word cloud for the text input (single-row DataFrame)
        try:
            wordcloud = generate_word_cloud_from_output(pd.DataFrame({"content": [text_input]}))
        except ValueError:
            # Raised when the text has no words that can be plotted.
            st.info("Not enough words to build a word cloud.")
        else:
            st.image(wordcloud.to_array(), caption="Word Cloud for Text Input", use_container_width=True)
    else:
        st.warning("Please enter some text to classify.")

# Section for Bulk CSV Classification
st.subheader("📂 Bulk Classification (CSV)")
file_input = st.file_uploader("Upload CSV File", type="csv")
if file_input:
    # On failure classify_csv returns (None, "<error message>"), so the second
    # value is either the output path or the error text.
    df, output_file = classify_csv(file_input)
    if df is not None:
        st.dataframe(df)
        # Read the processed file through a context manager so the handle is
        # closed instead of being left to the garbage collector.
        with open(output_file, "rb") as processed_csv:
            csv_bytes = processed_csv.read()
        st.download_button(
            label="Download Processed CSV",
            data=csv_bytes,
            file_name=output_file,
            mime="text/csv",
        )

        # Generate word cloud for the text of the processed CSV data.  A
        # failure here must not prevent the bar graph below from rendering.
        try:
            wordcloud = generate_word_cloud_from_output(df)
        except (KeyError, ValueError):
            st.warning("Could not build a word cloud from this file.")
        else:
            st.image(wordcloud.to_array(), caption="Word Cloud for CSV Content", use_container_width=True)

        # Generate bar graph for decoded predictions frequency
        generate_bar_graph(df)
    else:
        st.error(f"Error processing file: {output_file}")

# Section for Chatbot Interaction
st.subheader("💬 AI Chat Assistant")
# The script is re-run on every interaction, so a plain list would be reset
# each time; keep the transcript in session state so it accumulates.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []
history = st.session_state["chat_history"]
user_input = st.text_input("Ask about news classification or topics", placeholder="Type a message...")
source_toggle = st.radio("Select Context Source", ["Single Article", "Bulk Classification"])
if st.button("✉ Send"):
    chat_text = text_input if source_toggle == "Single Article" else None
    chat_file = file_input if source_toggle == "Bulk Classification" else None
    has_context = bool((chat_text and chat_text.strip()) or chat_file)

    if not user_input.strip():
        st.warning("Please type a question first.")
    elif not has_context:
        st.warning("Enter an article or upload a CSV file before asking a question.")
    else:
        history, bot_response = chatbot_response(
            history,
            user_input,
            text_input=chat_text,
            file_input=chat_file,
        )
    st.write("Chatbot Response:")
    for q, a in history:
        st.write(f"Q: {q}")
        st.write(f"A: {a}")