File size: 3,522 Bytes
fbf137c
45cf4ac
fbf137c
45cf4ac
fbf137c
 
 
 
 
 
 
 
 
 
45cf4ac
fbf137c
 
 
 
 
 
 
 
 
45cf4ac
fbf137c
45cf4ac
 
fbf137c
45cf4ac
fbf137c
 
 
 
53ed032
fbf137c
 
45cf4ac
 
 
fbf137c
 
 
45cf4ac
fbf137c
 
 
 
 
 
45cf4ac
 
fbf137c
45cf4ac
fbf137c
 
 
 
 
 
45cf4ac
 
 
 
fbf137c
 
45cf4ac
 
 
fbf137c
53ed032
fbf137c
53ed032
45cf4ac
53ed032
0f7f6fa
fbf137c
53ed032
45cf4ac
 
53ed032
 
 
 
45cf4ac
 
fbf137c
53ed032
fbf137c
0f2ab56
fbf137c
0f7f6fa
53ed032
45cf4ac
fbf137c
 
 
45cf4ac
 
 
fbf137c
8e26ef9
 
 
 
 
fbf137c
45cf4ac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import io
import os
import tempfile
import zipfile
from collections import Counter

import gradio as gr
import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

# Download NLTK resources
# One-time startup fetch of the tokenizer model and English stopword list
# used by analyze_bigrams() below.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Module-level cache of the most recent t-SNE projection: written by
# analyze_bigrams() and read back by generate_bar_plot() when the user
# lasso-selects points in the scatter plot.
embed_df = pd.DataFrame()

def analyze_bigrams(zip_file, perplexity):
    """Count bigrams in a ZIP of .txt files and project the top 100 to 2-D.

    Parameters
    ----------
    zip_file : bytes | None
        Raw bytes of the uploaded ZIP archive (gradio ``type="binary"``).
    perplexity : float
        Requested t-SNE perplexity; clamped below the sample count so
        small corpora do not crash the projection.

    Returns
    -------
    tuple
        ``(status_message, plotly_figure_or_None)``.

    Side effect: overwrites the module-level ``embed_df`` that
    ``generate_bar_plot`` reads.
    """
    global embed_df
    if zip_file is None:
        return "Please upload a ZIP file containing .txt files.", None

    zip_stream = io.BytesIO(zip_file)

    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(zip_stream, 'r') as zip_ref:
            zip_ref.extractall(tmpdir)

        # Walk recursively: archives often nest their .txt files inside
        # folders, which a flat os.listdir() on tmpdir would miss.
        txt_files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(tmpdir)
            for name in names
            if name.endswith(".txt")
        ]
        if not txt_files:
            return "No .txt files found.", None

        all_texts = []
        for path in txt_files:
            with open(path, "r", encoding="utf-8") as f:
                all_texts.append(f.read().lower())

        # Count alphabetic, non-stopword bigrams across all documents.
        bigram_counter = Counter()
        for text in all_texts:
            tokens = [w for w in word_tokenize(text)
                      if w.isalpha() and w not in stop_words]
            bigram_counter.update(' '.join(bg) for bg in ngrams(tokens, 2))

        top_bigrams = bigram_counter.most_common(100)
        if not top_bigrams:
            # Everything was filtered out (e.g. files of pure punctuation
            # or stopwords); bail before embedding an empty list.
            return "No bigrams found after filtering.", None
        bigram_texts = [phrase for phrase, _ in top_bigrams]
        counts = [count for _, count in top_bigrams]

        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(bigram_texts)

        # sklearn's TSNE requires perplexity < n_samples; clamp so that a
        # small corpus does not raise instead of plotting.
        safe_perplexity = max(1, min(int(perplexity), len(bigram_texts) - 1))
        tsne = TSNE(n_components=2, perplexity=safe_perplexity, random_state=42)
        tsne_results = tsne.fit_transform(embeddings)

        embed_df = pd.DataFrame({
            "bigram": bigram_texts,
            "count": counts,
            "tsne_1": tsne_results[:, 0],
            "tsne_2": tsne_results[:, 1],
        })

        fig = px.scatter(embed_df, x="tsne_1", y="tsne_2", hover_name="bigram",
                         size="count", template="plotly_white",
                         title="Bigram t-SNE Projection")
        fig.update_layout(dragmode="lasso")

        return "✅ Bigram analysis complete. Use lasso to select points.", fig

def generate_bar_plot(events):
    """Build a horizontal bar chart of the lasso-selected bigrams.

    ``events`` is the list of selection-event dicts emitted by the scatter
    plot; each carries a ``pointIndex`` into the module-level ``embed_df``
    produced by analyze_bigrams(). Returns a plotly figure, or None when
    there is no selection or no analysis has run yet.
    """
    global embed_df
    if embed_df.empty or not events:
        return None

    picked_rows = [event["pointIndex"] for event in events]
    selection = embed_df.iloc[picked_rows]
    ordered = selection.sort_values("count", ascending=False)

    return px.bar(
        ordered,
        x="count",
        y="bigram",
        orientation="h",
        title="Selected Bigram Frequencies",
    )

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 📦 Upload ZIP of .txt files to Analyze Bigrams")

    # type="binary" delivers the upload as raw bytes, which
    # analyze_bigrams wraps in io.BytesIO.
    zip_input = gr.File(label="Upload ZIP File of .txt Files", type="binary")
    perplexity_input = gr.Number(label="t-SNE Perplexity", value=30)

    analyze_btn = gr.Button("Analyze")
    status = gr.Label()
    scatter_plot = gr.Plot()
    bar_plot = gr.Plot()

    # Run the analysis and populate the status label + scatter figure.
    analyze_btn.click(analyze_bigrams,
                      inputs=[zip_input, perplexity_input],
                      outputs=[status, scatter_plot])

    # NOTE(review): `gr.plotly_events` does not appear to be a public
    # Gradio API (it resembles the streamlit-plotly-events package);
    # this line likely raises AttributeError at startup. Gradio's own
    # mechanism for plot selection is the component's `.select` event —
    # verify against the installed Gradio version and rewire accordingly.
    gr.plotly_events(scatter_plot, select_event=True)(
        generate_bar_plot,
        inputs=None,
        outputs=bar_plot
    )

demo.launch(share=True)