Spaces:

Framby
/

P7

Sleeping

App Files Files Community

Framby committed on Jul 16, 2025

Commit

8beae03

1 Parent(s): 3ec8e71

First commit

Browse files

Files changed (5) hide show

.gitignore +59 -0
Pipfile +11 -0
app.py +113 -0
model.py +16 -0
requirements.txt +31 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,59 @@

+# === Python bytecode ===
+__pycache__/
+*.py[cod]
+*$py.class
+# === Jupyter Notebooks checkpoints ===
+.ipynb_checkpoints
+# === Virtual environment ===
+.venv/
+venv/
+env/
+ENV/
+# === OS files ===
+.DS_Store
+Thumbs.db
+# === Streamlit cache ===
+.streamlit/cache/
+.streamlit/config.toml
+# === PyTorch checkpoints and model files ===
+*.pt
+*.pth
+*.bin
+# === Tokenizer and transformers cache ===
+.cache/
+transformers_cache/
+huggingface/
+# === Dataset or outputs ===
+*.csv
+*.tsv
+*.json
+*.xlsx
+*.log
+*.npy
+*.npz
+# === Model artifacts ===
+*.joblib
+*.pkl
+# === Environment files ===
+.env
+.env.*
+# === VSCode / IDE ===
+.vscode/
+.idea/
+# === Misc ===
+*.zip
+*.tar.gz
+*.egg-info/
+build/
+dist/

Pipfile ADDED Viewed

	@@ -0,0 +1,11 @@

+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+[packages]
+[dev-packages]
+[requires]
+python_version = "3.12"

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# app.py
+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import plotly.express as px
+from wordcloud import WordCloud
+from collections import Counter
+import torch
+from transformers import AutoTokenizer
+import joblib
+from model import MultiLabelDeberta
+# ========== Load the model and data ==========
+st.set_page_config(page_title="Tag Predictor", layout="wide")  # browser-tab title and wide page layout
+@st.cache_resource
+def load_model_and_tokenizer():
+    mlb = joblib.load("mlb.pkl")
+    model = MultiLabelDeberta(num_labels=len(mlb.classes_))
+    model.load_state_dict(torch.load(
+        "deberta_multilabel.pt", map_location="cpu"))
+    model.eval()
+    tokenizer = AutoTokenizer.from_pretrained(
+        "microsoft/deberta-v3-base", use_fast=False)
+    return model, tokenizer, mlb
+model, tokenizer, mlb = load_model_and_tokenizer()  # module-level objects used by predict_tags
+# ========== Load the data ==========
+@st.cache_data
+def load_data():
+    X = pd.read_csv('X_text.csv')['text_clean'].astype(str)
+    Y = pd.read_csv('Y_tags.csv', converters={'Tags': eval})['Tags']
+    return X, Y
+X, Y = load_data()  # X: question texts, Y: parsed Tags column
+# ========== Prediction function ==========
+def predict_tags(text, threshold=0.5):
+    inputs = tokenizer(
+        text,
+        return_tensors='pt',
+        truncation=True,
+        max_length=512,
+        padding='max_length'
+    )
+    inputs.pop('token_type_ids', None)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    probs = torch.sigmoid(outputs).squeeze().cpu().numpy()
+    binary_preds = (probs >= threshold).astype(int)
+    predicted_tags = mlb.inverse_transform(
+        np.expand_dims(binary_preds, axis=0))
+    return predicted_tags[0]
+# ========== Interface ==========
+st.title("Prédicteur de Tags StackOverflow")
+st.markdown("## 1. Analyse des données textuelles")
+col1, col2 = st.columns(2)  # two side-by-side columns for the charts
+with col1:
+    st.markdown("### Distribution de la longueur des questions")
+    text_lengths = X.apply(lambda x: len(x.split()))  # words per question (whitespace split)
+    fig = px.histogram(text_lengths, nbins=30,
+                       title="Distribution de la longueur des questions")
+    st.plotly_chart(fig, use_container_width=True)
+with col2:
+    st.markdown("### Mots les plus fréquents")
+    all_words = " ".join(X).split()  # every whitespace-separated token across all questions
+    word_freq = Counter(all_words)
+    most_common_words = pd.DataFrame(
+        word_freq.most_common(20), columns=['Mot', 'Nombre'])  # top 20 (word, count) pairs
+    fig2 = px.bar(most_common_words, x='Mot', y='Nombre',
+                  title="20 mots les plus fréquents")
+    st.plotly_chart(fig2, use_container_width=True)
+st.markdown("### Nuage de mots")
+wc = WordCloud(width=800, height=300,
+               background_color='white').generate(" ".join(X))  # word cloud built from all questions
+fig_wc, ax = plt.subplots(figsize=(10, 4))
+ax.imshow(wc, interpolation='bilinear')
+ax.axis("off")  # hide the axes around the image
+st.pyplot(fig_wc)
+st.markdown("---")
+st.markdown("## 2. Prédiction des tags")
+input_text = st.text_area("Entrez une question StackOverflow", height=150)
+threshold = st.slider("Seuil de probabilité", 0.1, 0.9, 0.5, 0.05)  # min 0.1, max 0.9, default 0.5, step 0.05
+if st.button("Prédire les tags"):
+    if input_text.strip():  # skip empty or whitespace-only input
+        tags = predict_tags(input_text, threshold)
+        if tags:
+            st.success("Tags prédits :")
+            st.write(", ".join(tags))
+        else:
+            st.warning("Aucun tag trouvé pour le seuil sélectionné.")
+    else:
+        st.warning("Veuillez entrer une question.")

model.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel
+class MultiLabelDeberta(nn.Module):
+    def __init__(self, num_labels):
+        super().__init__()
+        self.backbone = AutoModel.from_pretrained('microsoft/deberta-v3-base')
+        self.dropout = nn.Dropout(0.3)
+        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_labels)
+    def forward(self, input_ids, attention_mask):
+        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
+        pooled = outputs.last_hidden_state[:, 0]  # [CLS]
+        pooled = self.dropout(pooled)
+        logits = self.classifier(pooled)
+        return logits

requirements.txt ADDED Viewed

	@@ -0,0 +1,31 @@

+# === Core data libraries ===
+pandas>=1.3.0
+numpy>=1.21.0
+# === Visualization ===
+matplotlib>=3.5.0
+plotly>=5.3.1
+wordcloud>=1.8.1
+pillow>=9.0.0
+# === Web app interface ===
+streamlit>=1.20.0
+watchdog>=2.1.6  # improves file change detection in Streamlit
+# === NLP & Transformers ===
+torch>=2.0.0
+transformers>=4.31.0
+tokenizers>=0.13.3
+joblib>=1.2.0
+# === Text preprocessing ===
+beautifulsoup4>=4.12.0
+nltk>=3.8.1
+regex>=2023.12.25
+# === Progress bar (optional but common in model inference) ===
+tqdm>=4.64.0
+# === ML Utilities ===
+scikit-learn>=1.3.0
+sentencepiece>=0.1.99