File size: 1,424 Bytes
e8e2080
af43a92
e8e2080
8621c7a
e8e2080
8621c7a
d660f54
e8e2080
af43a92
e8e2080
8621c7a
 
e8e2080
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af43a92
 
e8e2080
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from flask import Flask
import gradio as gr
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import re

# WSGI application object; the "/" route defined further down is registered on it.
app = Flask(__name__)

# Function to clean text
def clean_text(text):
    """Normalize a title: lowercase it, strip URLs and non-letters, tidy spaces.

    Args:
        text: The raw string to normalize.

    Returns:
        The lowercased string reduced to the letters ``a``-``z`` separated by
        single spaces, with no leading or trailing whitespace.
    """
    # Order matters: URLs are removed before punctuation is stripped, otherwise
    # "http://a.b" would degrade to "httpab" and survive as a word.
    rewrite_steps = (
        (r"http\S+", ""),    # drop URLs
        (r"[^a-z\s]", ""),   # drop everything but lowercase letters/whitespace
        (r"\s+", " "),       # squeeze whitespace runs into one space
    )
    normalized = text.lower()
    for pattern, replacement in rewrite_steps:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

# Function to perform topic modeling
def extract_topics(csv_path="titles.csv"):
    """Fit a BERTopic model on the titles in a CSV file and summarize the topics.

    Args:
        csv_path: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV file with a ``title`` column. Defaults to
            ``"titles.csv"``, the file this function has always read.

    Returns:
        The ``Topic``, ``Name`` and ``Count`` columns of the fitted model's
        topic table, rendered as plain text without the index.

    Raises:
        ValueError: If no title is left to model after dropping missing
            values and titles that are empty once cleaned.
    """
    df = pd.read_csv(csv_path)
    # Cast to str so a column pandas parsed as numeric does not break
    # clean_text(), which calls .lower() on each value.
    raw_titles = df["title"].dropna().astype(str)
    cleaned = (clean_text(title) for title in raw_titles)
    # Titles made only of digits, punctuation or URLs clean down to "", and an
    # empty document gives the model nothing to work with.
    documents = [doc for doc in cleaned if doc]
    if not documents:
        # Fail early with a readable message instead of an opaque error from
        # deep inside the modeling stack (and before loading any model).
        raise ValueError("No usable titles found in the 'title' column.")

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    topic_model = BERTopic(embedding_model=embedding_model, min_topic_size=2)
    # Only the fitted model is needed; the per-document assignments returned
    # by fit_transform() are not used.
    topic_model.fit_transform(documents)
    topic_info = topic_model.get_topic_info()
    return topic_info[["Topic", "Name", "Count"]].to_string(index=False)

# Gradio interface: no input components, a single plain-text output
demo = gr.Interface(
    fn=extract_topics,
    inputs=[],
    outputs="text",
)

@app.route("/")
def home():
    """Start the Gradio demo if it is not running yet and redirect to it.

    Returns:
        A Flask redirect response pointing at the Gradio server's local URL.
    """
    # Imported here so the fix does not depend on the module's import block.
    from flask import redirect

    # A plain launch() blocks the calling thread when run from a script and
    # returns a tuple that Flask cannot turn into a response, so the request
    # never completed. Launch once without blocking and reuse that server on
    # later requests. `inline` is omitted: per the Gradio docs it only affects
    # rendering inside notebooks.
    if not getattr(demo, "is_running", False):
        demo.launch(share=False, prevent_thread_lock=True)
    # NOTE(review): local_url is a loopback address, which matches the
    # loopback default of app.run(); revisit if this is served to remote
    # clients.
    return redirect(demo.local_url)

# Start the Flask server only when this file is executed directly, not on import.
if __name__ == "__main__":
    # Called without arguments, so Flask's default host and port apply.
    app.run()