import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# ----------------------------
#  Load the CSV and clean up the column names
# ----------------------------

# Semicolon-separated export; "utf-8-sig" drops a leading byte-order mark if
# one is present so it does not end up inside the first column name.
df = pd.read_csv("Testdaten_Mersen_kompatibel.csv", sep=";", encoding="utf-8-sig")

# Normalise the header names: trim surrounding whitespace and turn
# non-breaking spaces into ordinary ones. The non-breaking space is written as
# an explicit escape because an invisible literal character is easily
# converted into a plain space by editors or copy/paste, which would silently
# turn this call into a no-op.
df.columns = (
    df.columns
    .str.strip()
    .str.replace("\u00a0", " ", regex=False)
)

# Process the request date: fail early with the list of available columns if
# the date column is missing.
if "Anfrage_Datum" not in df.columns:
    raise ValueError("Spalte 'Anfrage_Datum' nicht gefunden. Verfügbare Spalten: " + str(df.columns.tolist()))

# Unparseable dates become NaT instead of raising (errors="coerce").
# NOTE(review): no explicit date format / dayfirst is given — confirm that the
# dates in the CSV are parsed the way they were written.
df["Anfrage_Datum"] = pd.to_datetime(df["Anfrage_Datum"], errors="coerce")
# Weekday name of each request; the input form of this script offers the same
# English names ("Monday" ... "Sunday") for this feature.
df["Wochentag"] = df["Anfrage_Datum"].dt.day_name()

# ----------------------------
#  Define features & target
# ----------------------------

# Column groups, one per preprocessing strategy used by the model pipeline.
categorical_features = [
    "Kundentyp",
    "Branche",
    "Produktgruppe",
    "Region",
    "Kanal",
    "Dringlichkeit",
    "Wochentag",
]
text_feature = "Anfrage_Text"
numeric_feature = "Projektgröße (€)"

# Feature frame: the categorical columns first, then the free-text column and
# finally the numeric column.
X = df[[*categorical_features, text_feature, numeric_feature]]

# Target column the classifier is trained to predict.
y = df["Abschluss"]

# ----------------------------
#  Model pipeline
# ----------------------------

# One transformer per column group:
#   - categorical columns are one-hot encoded; categories not seen during
#     fitting are ignored at prediction time instead of raising,
#   - the text column is given as a plain string (not a list) so the
#     vectorizer receives a one-dimensional column of documents,
#   - the numeric column is handed to the classifier unchanged.
# NOTE(review): the numeric column is not scaled before it reaches the
# logistic regression — confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("text", TfidfVectorizer(), text_feature),
        ("num", "passthrough", [numeric_feature]),
    ]
)

# Preprocessing followed by the classifier, fitted on the complete data set
# (no hold-out split is made here). `fit` returns the pipeline itself.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
).fit(X, y)

# ----------------------------
#  Prediction function
# ----------------------------

def predict_lead(kundentyp, branche, produktgruppe, region, kanal,
                 dringlichkeit, wochentag, anfrage_text, projektgroesse):
    """Score a single request and return a one-line priority summary.

    The arguments are assembled into a one-row DataFrame that uses the same
    column names as the training data and is passed to the fitted
    module-level ``pipeline``.

    Args:
        kundentyp: Value for the "Kundentyp" column.
        branche: Value for the "Branche" column.
        produktgruppe: Value for the "Produktgruppe" column.
        region: Value for the "Region" column.
        kanal: Value for the "Kanal" column.
        dringlichkeit: Value for the "Dringlichkeit" column.
        wochentag: Value for the "Wochentag" column.
        anfrage_text: Free text of the request; ``None`` is treated as an
            empty text.
        projektgroesse: Project size in euros; anything ``float()`` accepts.

    Returns:
        A string with the predicted probability (two decimals) and the
        derived priority, or a short hint asking for a valid project size
        when ``projektgroesse`` is missing or not numeric.
    """
    # An empty number field arrives as None; answer with a readable message
    # instead of failing with a TypeError inside float().
    try:
        project_size = float(projektgroesse)
    except (TypeError, ValueError):
        return "Bitte eine gültige Projektgröße (€) angeben."

    input_data = pd.DataFrame([{
        "Kundentyp": kundentyp,
        "Branche": branche,
        "Produktgruppe": produktgruppe,
        "Region": region,
        "Kanal": kanal,
        "Dringlichkeit": dringlichkeit,
        "Wochentag": wochentag,
        # The text vectorizer needs a string; a missing text becomes "".
        "Anfrage_Text": anfrage_text if anfrage_text is not None else "",
        "Projektgröße (€)": project_size,
    }])

    # First (only) row, second probability column.
    # NOTE(review): this assumes the second class of the fitted classifier is
    # the "deal closed" label — confirm against the values of "Abschluss".
    prob = pipeline.predict_proba(input_data)[0][1]

    # Map the probability onto three priority bands.
    if prob >= 0.75:
        klasse = "hoch"
    elif prob >= 0.4:
        klasse = "mittel"
    else:
        klasse = "niedrig"

    return f"Abschlusswahrscheinlichkeit: {prob:.2f} → Priorität: {klasse.upper()}"

# ----------------------------
#  Gradio UI
# ----------------------------

# Label -> selectable values for every dropdown, in the order in which
# predict_lead expects its first seven arguments.
_DROPDOWN_CHOICES = {
    "Kundentyp": ["Neukunde", "Bestandskunde", "OEM"],
    "Branche": ["Gebäude", "Infrastruktur"],
    "Produktgruppe": ["Sicherung", "Graphitmodul", "Isolationsmaterial", "Spezialfertigung"],
    "Region": ["DACH"],
    "Kanal": ["Webformular", "E-Mail", "Vertriebspartner"],
    "Dringlichkeit": ["sofort", "Q1 2025", "Q2 2025", "nächstes Jahr", "unklar"],
    "Wochentag": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
}

# Form definition: seven dropdowns, then the free-text field and the numeric
# project-size field; the result is shown as plain text.
demo = gr.Interface(
    fn=predict_lead,
    inputs=[
        *(
            gr.Dropdown(choices, label=label)
            for label, choices in _DROPDOWN_CHOICES.items()
        ),
        gr.Textbox(label="Anfrage-Text"),
        gr.Number(label="Projektgröße (€)"),
    ],
    outputs="text",
    title="📈 Lead-Priorisierung für Angebotsanfragen bei Mersen",
    description="Dieses Modell bewertet Angebotsanfragen nach ihrer Abschlusswahrscheinlichkeit und priorisiert Leads.",
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()