Spaces:

hanifekaptan
/

string_similarity_encoder_algorithms

Sleeping

App Files Files Community

hanifekaptan committed on Aug 24, 2025

Commit

4f39bc7

verified ·

1 Parent(s): e2380b3

dosyalar yüklendi

Browse files

Files changed (7) hide show

.gitattributes +2 -0
src/app.py +147 -0
src/cross_encoder_model.py +42 -0
src/encoder_algorithm.png +0 -0
src/mixed_cross_encoder_model.py +54 -0
src/v2_cross_encoder.keras +3 -0
src/v2_mixed_data_cross_encoder.keras +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/v2_cross_encoder.keras filter=lfs diff=lfs merge=lfs -text
+src/v2_mixed_data_cross_encoder.keras filter=lfs diff=lfs merge=lfs -text

src/app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import streamlit as st
+import tensorflow as tf
+import numpy as np
+import pandas as pd
+from transformers import AutoTokenizer
+from cross_encoder_model import CrossEncoderTF
+from mixed_cross_encoder_model import MixedDataCrossEncoderTF
+MODEL_NAME = "dbmdz/bert-base-turkish-cased"
+SAVED_CROSS_ENCODER_MODEL_PATH = "src/v2_cross_encoder.keras"
+SAVED_MIXED_CROSS_ENCODER_MODEL_PATH = "src/v2_mixed_data_cross_encoder.keras"
+MAX_TOKEN_LEN = 32
+DATA_FILE_PATH = "model_0_data.csv"
+TEXT_COLS = ['STRA', 'STRB']
+LABEL_COL = 'DISTANCE'
+EXCLUDE_COLS = TEXT_COLS + [LABEL_COL, 'FILLER']
+NUMERICAL_FEATURE_DIM = 5132
+@st.cache_data
+def load_data():
+    try:
+        df = pd.read_csv(DATA_FILE_PATH, decimal=',', low_memory=False)
+    except FileNotFoundError:
+        st.error(f"Veri dosyası bulunamadı: {DATA_FILE_PATH}. Lütfen dosyanın uygulamanın çalıştığı dizinde olduğundan emin olun.")
+        st.stop()
+    return df
+@st.cache_resource
+def load_models_and_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    cross_encoder_model = tf.keras.models.load_model(
+        SAVED_CROSS_ENCODER_MODEL_PATH,
+        custom_objects={'CrossEncoderTF': CrossEncoderTF}
+    )
+    mixed_cross_encoder_model = tf.keras.models.load_model(
+        SAVED_MIXED_CROSS_ENCODER_MODEL_PATH,
+        custom_objects={'MixedDataCrossEncoderTF': MixedDataCrossEncoderTF,
+                        'numerical_feature_dim': NUMERICAL_FEATURE_DIM}
+    )
+    return tokenizer, cross_encoder_model, mixed_cross_encoder_model
+try:
+    df_data = load_data()
+    numerical_feature_cols = df_data.columns.drop(EXCLUDE_COLS).tolist()
+    NUMERICAL_FEATURE_DIM = len(numerical_feature_cols)
+    tokenizer, cross_encoder_model, mixed_cross_encoder_model = load_models_and_tokenizer()
+except Exception as e:
+    st.error(f"Yüklenirken bir hata oluştu: {e}")
+    st.stop()
+def predict(model, tokenizer, str_a, str_b, numerical_features=None):
+    tokenized = tokenizer(
+        str_a, str_b,
+        max_length=MAX_TOKEN_LEN,
+        padding='max_length',
+        truncation=True,
+        return_tensors='np'
+    )
+    model_input = {
+        'input_ids': tokenized['input_ids'],
+        'attention_mask': tokenized['attention_mask'],
+    }
+    if numerical_features is not None:
+        model_input['numerical_features'] = numerical_features.reshape(1, -1).astype('float32')
+    prediction = model.predict(model_input)
+    score = prediction[0][0]
+    return float(score)
+st.set_page_config(page_title="Varlık Benzerlik Testi", layout="centered")
+st.title("İki Model Karşılaştırmalı Varlık Benzerlik Test Arayüzü")
+st.info(
+    "Bu uygulama, metinsel verileri kullanarak iki varlığın "
+    "benzerlik olasılığını tahmin eder ve iki farklı modelin sonuçlarını karşılaştırır."
+    "(henüz bi-encoder mimarisi eklenmemiştir, sadece cross-encoder modeli kullanılıyor)"
+    "\n\n**Cross-encoder mimarisi:** yalnızca metin1, metin2 ve distance özellikleri ile eğitilmiştir."
+    "\n\n**Mixed-cross-encoder mimarisi:** metin1, metin2, distance ve numerik özellikler ile eğitilmiştir."
+)
+st.image("src/encoder_algorithm.png", caption="Encoder Algoritma Akışı", use_container_width=True)
+st.header("Girdi String'leri")
+stra_options = df_data['STRA'].unique()
+str_a_input = st.selectbox("String A (STRA)", stra_options)
+filtered_strb_options = df_data[df_data['STRA'] == str_a_input]['STRB'].unique()
+str_b_input = st.selectbox("String B (STRB)", filtered_strb_options)
+if st.button("Benzerliği Hesapla", type="primary"):
+    if not str_a_input or not str_b_input:
+        st.error("Lütfen her iki string alanını da seçin.")
+    else:
+        with st.spinner("Tahminler yapılıyor..."):
+            selected_row = df_data[(df_data['STRA'] == str_a_input) & (df_data['STRB'] == str_b_input)]
+            if not selected_row.empty:
+                numerical_features_for_prediction = selected_row[numerical_feature_cols].iloc[0].values
+            else:
+                st.error("Seçilen string'lere ait veri bulunamadı. Lütfen farklı seçimler yapın.")
+                st.stop()
+            cross_encoder_distance_score = predict(cross_encoder_model, tokenizer, str_a_input, str_b_input)
+            cross_encoder_similarity_score = 1 - cross_encoder_distance_score
+            mixed_cross_encoder_distance_score = predict(mixed_cross_encoder_model, tokenizer, str_a_input, str_b_input, numerical_features_for_prediction)
+            mixed_cross_encoder_similarity_score = 1 - mixed_cross_encoder_distance_score
+            actual_row = df_data[(df_data['STRA'] == str_a_input) & (df_data['STRB'] == str_b_input)]
+            if not actual_row.empty:
+                actual_distance = actual_row[LABEL_COL].iloc[0]
+                actual_similarity = 1 - actual_distance
+            else:
+                actual_distance = np.nan
+                actual_similarity = np.nan
+            st.subheader("Karşılaştırmalı Sonuçlar")
+            results_data = {
+                "Özellik": ["Tahmin Edilen Benzerlik", "Gerçek Benzerlik", "Tahmin Edilen Mesafe", "Gerçek Mesafe", "Karar"],
+                "Cross-Encoder Model": [
+                    f"{cross_encoder_similarity_score:.4f}",
+                    f"{actual_similarity:.4f}" if not np.isnan(actual_similarity) else "N/A",
+                    f"{cross_encoder_distance_score:.4f}",
+                    f"{actual_distance:.4f}" if not np.isnan(actual_distance) else "N/A",
+                    "BENZER" if cross_encoder_similarity_score > 0.5 else "BENZER DEĞİL"
+                ],
+                "Mixed Cross-Encoder Model": [
+                    f"{mixed_cross_encoder_similarity_score:.4f}",
+                    f"{actual_similarity:.4f}" if not np.isnan(actual_similarity) else "N/A",
+                    f"{mixed_cross_encoder_distance_score:.4f}",
+                    f"{actual_distance:.4f}" if not np.isnan(actual_distance) else "N/A",
+                    "BENZER" if mixed_cross_encoder_similarity_score > 0.5 else "BENZER DEĞİL"
+                ]
+            }
+            results_df = pd.DataFrame(results_data).set_index("Özellik")
+            st.dataframe(results_df)
+            st.markdown("---")
+            st.markdown(f"**Cross-Encoder Model Kararı:** `{str_a_input}` ve `{str_b_input}` kelimeleri **:{'blue' if cross_encoder_similarity_score > 0.5 else 'red'}[{'BENZER' if cross_encoder_similarity_score > 0.5 else 'BENZER DEĞİL'}]**.")
+            st.markdown(f"**Mixed Cross-Encoder Model Kararı:** `{str_a_input}` ve `{str_b_input}` kelimeleri **:{'blue' if mixed_cross_encoder_similarity_score > 0.5 else 'red'}[{'BENZER' if mixed_cross_encoder_similarity_score > 0.5 else 'BENZER DEĞİL'}]**.")

src/cross_encoder_model.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# src/cross_encoder_model.py
+import tensorflow as tf
+from tensorflow.keras import layers, Model
+from transformers import TFAutoModel
+class CrossEncoderTF(Model):
+    """Keras model that scores a tokenized text pair with a transformer.
+
+    The transformer's ``pooler_output`` is fed to a dense classifier whose
+    last layer is a single sigmoid-activated unit.
+    """
+    def __init__(self, model_name="dbmdz/bert-base-turkish-cased", max_token_len=32, **kwargs):
+        """Build the transformer backbone and the classifier head.
+
+        Args:
+            model_name: Identifier passed to ``TFAutoModel.from_pretrained``.
+            max_token_len: Stored on the instance and written to the config;
+                it is not otherwise used inside this class.
+            **kwargs: Forwarded to the Keras ``Model`` constructor.
+        """
+        super().__init__(**kwargs)
+        self.model_name = model_name
+        self.max_token_len = max_token_len
+        # 1. Text pipeline (Transformer)
+        self.bert = TFAutoModel.from_pretrained(model_name)
+        # 2. Output head only
+        self.classifier = tf.keras.Sequential([
+            layers.Dense(256, activation='relu'),
+            layers.BatchNormalization(),
+            layers.Dropout(0.3),
+            layers.Dense(128, activation='relu'),
+            layers.BatchNormalization(),
+            layers.Dense(64, activation='relu'),
+            layers.BatchNormalization(),
+            layers.Dense(1, activation='sigmoid')
+        ], name="classifier")
+    def call(self, inputs):
+        """Return the classifier score for ``inputs``.
+
+        Args:
+            inputs: Mapping with ``'input_ids'`` and ``'attention_mask'``
+                entries, both passed to the transformer.
+        """
+        bert_output = self.bert(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+        text_features = bert_output.pooler_output
+        prediction_score = self.classifier(text_features)
+        return prediction_score
+    def get_config(self):
+        """Return the base config extended with this class's constructor arguments."""
+        config = super().get_config()
+        config.update({
+            "model_name": self.model_name,
+            "max_token_len": self.max_token_len,
+        })
+        return config

src/encoder_algorithm.png ADDED Viewed

src/mixed_cross_encoder_model.py ADDED Viewed

	@@ -0,0 +1,54 @@

+# src/mixed_cross_encoder_model.py
+import tensorflow as tf
+from tensorflow.keras import layers, Model
+from transformers import TFAutoModel
+class MixedDataCrossEncoderTF(Model):
+    """Keras model that scores a text pair using text and numerical features.
+
+    The transformer's ``pooler_output`` is concatenated with the output of a
+    small MLP over the numerical features, and the result is fed to a dense
+    classifier whose last layer is a single sigmoid-activated unit.
+    """
+    def __init__(self, model_name="dbmdz/bert-base-turkish-cased", numerical_feature_dim=5132, max_token_len=32, **kwargs):
+        """Build the transformer backbone, the numerical MLP and the classifier.
+
+        Args:
+            model_name: Identifier passed to ``TFAutoModel.from_pretrained``.
+            numerical_feature_dim: Size of the numerical feature vector; used
+                as the input shape of the numerical MLP.
+            max_token_len: Stored on the instance and written to the config;
+                it is not otherwise used inside this class.
+            **kwargs: Forwarded to the Keras ``Model`` constructor.
+        """
+        super().__init__(**kwargs)
+        self.model_name = model_name
+        self.numerical_feature_dim = numerical_feature_dim
+        self.max_token_len = max_token_len
+        # Text branch (Transformer).
+        self.bert = TFAutoModel.from_pretrained(model_name)
+        # Numerical branch: reduces the feature vector to 128 units.
+        self.numerical_mlp = tf.keras.Sequential([
+            layers.Input(shape=(numerical_feature_dim,)),
+            layers.Dense(512, activation='relu'),
+            layers.Dropout(0.3),
+            layers.Dense(128, activation='relu')
+        ], name="numerical_mlp")
+        # Joins the text features with the processed numerical features.
+        self.concatenation = layers.Concatenate()
+        # Classifier head applied to the combined features.
+        self.classifier = tf.keras.Sequential([
+            layers.Dense(256, activation='relu'),
+            layers.BatchNormalization(),
+            layers.Dropout(0.3),
+            layers.Dense(128, activation='relu'),
+            layers.BatchNormalization(),
+            layers.Dense(64, activation='relu'),
+            layers.BatchNormalization(),
+            layers.Dense(1, activation='sigmoid')
+        ], name="classifier")
+    def call(self, inputs):
+        """Return the classifier score for ``inputs``.
+
+        Args:
+            inputs: Mapping with ``'input_ids'`` and ``'attention_mask'``
+                (passed to the transformer) and ``'numerical_features'``
+                (passed to the numerical MLP).
+        """
+        bert_output = self.bert(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+        text_features = bert_output.pooler_output
+        numerical_processed_features = self.numerical_mlp(inputs['numerical_features'])
+        combined_features = self.concatenation([text_features, numerical_processed_features])
+        prediction_score = self.classifier(combined_features)
+        return prediction_score
+    def get_config(self):
+        """Return the base config extended with this class's constructor arguments."""
+        config = super().get_config()
+        config.update({
+            "model_name": self.model_name,
+            "numerical_feature_dim": self.numerical_feature_dim,
+            "max_token_len": self.max_token_len, # also include max_token_len
+        })
+        return config

src/v2_cross_encoder.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:279deaf2759bce8a936ae23d4486e3e97f6c8c9c12e560e66662e81799aa7bc7
+size 2916631

src/v2_mixed_data_cross_encoder.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d9f8f22a50b2c990c49ce98915547012f8a82e8192c1a8e3a4a82e59b432b66
+size 35650863