hanifekaptan committed on
Commit
2e62cc6
·
verified ·
1 Parent(s): 2d78e50

data path güncellendi

Browse files
Files changed (1) hide show
  1. src/app.py +147 -147
src/app.py CHANGED
@@ -1,147 +1,147 @@
1
- import streamlit as st
2
- import tensorflow as tf
3
- import numpy as np
4
- import pandas as pd
5
- from transformers import AutoTokenizer
6
- from cross_encoder_model import CrossEncoderTF
7
- from mixed_cross_encoder_model import MixedDataCrossEncoderTF
8
-
9
# Hugging Face checkpoint name passed to AutoTokenizer.from_pretrained.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"
# Saved Keras model files (relative paths).
SAVED_CROSS_ENCODER_MODEL_PATH = "src/v2_cross_encoder.keras"
SAVED_MIXED_CROSS_ENCODER_MODEL_PATH = "src/v2_mixed_data_cross_encoder.keras"
# Length every tokenized string pair is padded/truncated to.
MAX_TOKEN_LEN = 32
# CSV holding the string pairs, their label and the numerical features.
DATA_FILE_PATH = "model_0_data.csv"
TEXT_COLS = ['STRA', 'STRB']
LABEL_COL = 'DISTANCE'
# Columns dropped from the frame when the numerical feature list is built.
EXCLUDE_COLS = TEXT_COLS + [LABEL_COL, 'FILLER']
# Initial value only: the startup code reassigns this from the loaded columns.
NUMERICAL_FEATURE_DIM = 5132
18
-
19
@st.cache_data
def load_data():
    """Read the CSV at ``DATA_FILE_PATH`` into a DataFrame.

    The file is parsed with ``,`` as the decimal separator. The result is
    memoised by ``st.cache_data``.

    Returns:
        The loaded ``pandas.DataFrame``.

    If the file does not exist, an error is shown in the app and the
    script run is halted with ``st.stop()``.
    """
    try:
        return pd.read_csv(DATA_FILE_PATH, decimal=',', low_memory=False)
    except FileNotFoundError:
        st.error(f"Veri dosyası bulunamadı: {DATA_FILE_PATH}. Lütfen dosyanın uygulamanın çalıştığı dizinde olduğundan emin olun.")
        st.stop()
27
-
28
@st.cache_resource
def load_models_and_tokenizer():
    """Load the tokenizer and both saved Keras models.

    The result is memoised by ``st.cache_resource``.

    Returns:
        A ``(tokenizer, cross_encoder_model, mixed_cross_encoder_model)``
        tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    cross_encoder_model = tf.keras.models.load_model(
        SAVED_CROSS_ENCODER_MODEL_PATH,
        custom_objects={'CrossEncoderTF': CrossEncoderTF},
    )

    # NOTE(review): 'numerical_feature_dim' maps to an int rather than a
    # class or function; kept exactly as in the original call. Confirm the
    # saved model actually needs this entry.
    mixed_cross_encoder_model = tf.keras.models.load_model(
        SAVED_MIXED_CROSS_ENCODER_MODEL_PATH,
        custom_objects={
            'MixedDataCrossEncoderTF': MixedDataCrossEncoderTF,
            'numerical_feature_dim': NUMERICAL_FEATURE_DIM,
        },
    )

    return tokenizer, cross_encoder_model, mixed_cross_encoder_model
44
-
45
# Startup: load the dataset, derive the numerical feature columns, then load
# the tokenizer and models. Any failure is reported in the app and the script
# run is halted.
try:
    df_data = load_data()
    # Every column not listed in EXCLUDE_COLS is treated as a numerical
    # feature. Index.drop raises KeyError if a listed column is missing,
    # which the handler below reports.
    numerical_feature_cols = df_data.columns.drop(EXCLUDE_COLS).tolist()
    # Reassigned before the models are loaded, so the loader reads this value.
    NUMERICAL_FEATURE_DIM = len(numerical_feature_cols)
    tokenizer, cross_encoder_model, mixed_cross_encoder_model = load_models_and_tokenizer()
except Exception as e:
    st.error(f"Yüklenirken bir hata oluştu: {e}")
    st.stop()
53
-
54
def predict(model, tokenizer, str_a, str_b, numerical_features=None, *, max_length=None):
    """Score one string pair with ``model`` and return the result as a float.

    Args:
        model: Object exposing ``predict(inputs)``; it receives a dict with
            ``input_ids`` and ``attention_mask`` and, when provided,
            ``numerical_features``.
        tokenizer: Callable that tokenizes the pair and returns a mapping
            containing ``input_ids`` and ``attention_mask``.
        str_a: First string of the pair.
        str_b: Second string of the pair.
        numerical_features: Optional array-like (NumPy array, list, Series)
            of feature values; it is converted to a ``float32`` array of
            shape ``(1, n)``.
        max_length: Token length to pad/truncate to. Defaults to the
            module-level ``MAX_TOKEN_LEN``.

    Returns:
        ``prediction[0][0]`` converted to ``float``. The UI code treats this
        value as a distance and derives similarity as ``1 - score``.
    """
    if max_length is None:
        max_length = MAX_TOKEN_LEN

    tokenized = tokenizer(
        str_a, str_b,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )

    model_input = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
    }

    if numerical_features is not None:
        # np.asarray accepts lists and Series as well as arrays, so callers
        # are not required to pass an ndarray.
        model_input['numerical_features'] = np.asarray(
            numerical_features, dtype='float32'
        ).reshape(1, -1)

    prediction = model.predict(model_input)
    return float(prediction[0][0])
75
-
76
# --- Page layout and static content ---
# NOTE(review): Streamlit documents set_page_config as a call to make before
# other Streamlit commands; the startup code that runs earlier may already
# emit st.error. Confirm the deployed Streamlit version tolerates this order.
st.set_page_config(page_title="Varlık Benzerlik Testi", layout="centered")
st.title("İki Model Karşılaştırmalı Varlık Benzerlik Test Arayüzü")

st.info(
    "Bu uygulama, metinsel verileri kullanarak iki varlığın "
    "benzerlik olasılığını tahmin eder ve iki farklı modelin sonuçlarını karşılaştırır."
    "(henüz bi-encoder mimarisi eklenmemiştir, sadece cross-encoder modeli kullanılıyor)"
    "\n\n**Cross-encoder mimarisi:** yalnızca metin1, metin2 ve distance özellikleri ile eğitilmiştir."
    "\n\n**Mixed-cross-encoder mimarisi:** metin1, metin2, distance ve numerik özellikler ile eğitilmiştir."
)

st.image("src/encoder_algorithm.png", caption="Encoder Algoritma Akışı", use_container_width=True)

# --- Input selection: STRB choices are limited to rows matching the chosen STRA ---
st.header("Girdi String'leri")

stra_options = df_data['STRA'].unique()
str_a_input = st.selectbox("String A (STRA)", stra_options)

filtered_strb_options = df_data[df_data['STRA'] == str_a_input]['STRB'].unique()
str_b_input = st.selectbox("String B (STRB)", filtered_strb_options)

# --- Prediction and comparison ---
if st.button("Benzerliği Hesapla", type="primary"):
    if not str_a_input or not str_b_input:
        st.error("Lütfen her iki string alanını da seçin.")
    else:
        with st.spinner("Tahminler yapılıyor..."):
            selected_row = df_data[(df_data['STRA'] == str_a_input) & (df_data['STRB'] == str_b_input)]
            if selected_row.empty:
                st.error("Seçilen string'lere ait veri bulunamadı. Lütfen farklı seçimler yapın.")
                st.stop()

            # The first matching row supplies both the features and the label.
            numerical_features_for_prediction = selected_row[numerical_feature_cols].iloc[0].values

            cross_encoder_distance_score = predict(cross_encoder_model, tokenizer, str_a_input, str_b_input)
            cross_encoder_similarity_score = 1 - cross_encoder_distance_score

            mixed_cross_encoder_distance_score = predict(mixed_cross_encoder_model, tokenizer, str_a_input, str_b_input, numerical_features_for_prediction)
            mixed_cross_encoder_similarity_score = 1 - mixed_cross_encoder_distance_score

            # selected_row is known to be non-empty here, so the label is read
            # from it directly instead of repeating the same lookup.
            actual_distance = selected_row[LABEL_COL].iloc[0]
            actual_similarity = 1 - actual_distance

        st.subheader("Karşılaştırmalı Sonuçlar")

        # The label cell itself may still be NaN; show "N/A" in that case.
        actual_similarity_text = f"{actual_similarity:.4f}" if not np.isnan(actual_similarity) else "N/A"
        actual_distance_text = f"{actual_distance:.4f}" if not np.isnan(actual_distance) else "N/A"

        # A pair counts as similar when its similarity exceeds 0.5.
        cross_is_similar = cross_encoder_similarity_score > 0.5
        mixed_is_similar = mixed_cross_encoder_similarity_score > 0.5
        cross_verdict = "BENZER" if cross_is_similar else "BENZER DEĞİL"
        mixed_verdict = "BENZER" if mixed_is_similar else "BENZER DEĞİL"
        cross_color = 'blue' if cross_is_similar else 'red'
        mixed_color = 'blue' if mixed_is_similar else 'red'

        results_data = {
            "Özellik": ["Tahmin Edilen Benzerlik", "Gerçek Benzerlik", "Tahmin Edilen Mesafe", "Gerçek Mesafe", "Karar"],
            "Cross-Encoder Model": [
                f"{cross_encoder_similarity_score:.4f}",
                actual_similarity_text,
                f"{cross_encoder_distance_score:.4f}",
                actual_distance_text,
                cross_verdict,
            ],
            "Mixed Cross-Encoder Model": [
                f"{mixed_cross_encoder_similarity_score:.4f}",
                actual_similarity_text,
                f"{mixed_cross_encoder_distance_score:.4f}",
                actual_distance_text,
                mixed_verdict,
            ],
        }
        results_df = pd.DataFrame(results_data).set_index("Özellik")
        st.dataframe(results_df)

        st.markdown("---")
        st.markdown(f"**Cross-Encoder Model Kararı:** `{str_a_input}` ve `{str_b_input}` kelimeleri **:{cross_color}[{cross_verdict}]**.")
        st.markdown(f"**Mixed Cross-Encoder Model Kararı:** `{str_a_input}` ve `{str_b_input}` kelimeleri **:{mixed_color}[{mixed_verdict}]**.")
 
1
+ import streamlit as st
2
+ import tensorflow as tf
3
+ import numpy as np
4
+ import pandas as pd
5
+ from transformers import AutoTokenizer
6
+ from cross_encoder_model import CrossEncoderTF
7
+ from mixed_cross_encoder_model import MixedDataCrossEncoderTF
8
+
9
# Hugging Face checkpoint name passed to AutoTokenizer.from_pretrained.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"
# Saved Keras model files (relative paths).
SAVED_CROSS_ENCODER_MODEL_PATH = "src/v2_cross_encoder.keras"
SAVED_MIXED_CROSS_ENCODER_MODEL_PATH = "src/v2_mixed_data_cross_encoder.keras"
# Length every tokenized string pair is padded/truncated to.
MAX_TOKEN_LEN = 32
# CSV holding the string pairs, their label and the numerical features.
DATA_FILE_PATH = "src/model_0_data.csv"
TEXT_COLS = ['STRA', 'STRB']
LABEL_COL = 'DISTANCE'
# Columns dropped from the frame when the numerical feature list is built.
EXCLUDE_COLS = TEXT_COLS + [LABEL_COL, 'FILLER']
# Initial value only: the startup code reassigns this from the loaded columns.
NUMERICAL_FEATURE_DIM = 5132
18
+
19
@st.cache_data
def load_data():
    """Read the CSV at ``DATA_FILE_PATH`` into a DataFrame.

    The file is parsed with ``,`` as the decimal separator. The result is
    memoised by ``st.cache_data``.

    Returns:
        The loaded ``pandas.DataFrame``.

    If the file does not exist, an error is shown in the app and the
    script run is halted with ``st.stop()``.
    """
    try:
        return pd.read_csv(DATA_FILE_PATH, decimal=',', low_memory=False)
    except FileNotFoundError:
        st.error(f"Veri dosyası bulunamadı: {DATA_FILE_PATH}. Lütfen dosyanın uygulamanın çalıştığı dizinde olduğundan emin olun.")
        st.stop()
27
+
28
@st.cache_resource
def load_models_and_tokenizer():
    """Load the tokenizer and both saved Keras models.

    The result is memoised by ``st.cache_resource``.

    Returns:
        A ``(tokenizer, cross_encoder_model, mixed_cross_encoder_model)``
        tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    cross_encoder_model = tf.keras.models.load_model(
        SAVED_CROSS_ENCODER_MODEL_PATH,
        custom_objects={'CrossEncoderTF': CrossEncoderTF},
    )

    # NOTE(review): 'numerical_feature_dim' maps to an int rather than a
    # class or function; kept exactly as in the original call. Confirm the
    # saved model actually needs this entry.
    mixed_cross_encoder_model = tf.keras.models.load_model(
        SAVED_MIXED_CROSS_ENCODER_MODEL_PATH,
        custom_objects={
            'MixedDataCrossEncoderTF': MixedDataCrossEncoderTF,
            'numerical_feature_dim': NUMERICAL_FEATURE_DIM,
        },
    )

    return tokenizer, cross_encoder_model, mixed_cross_encoder_model
44
+
45
# Startup: load the dataset, derive the numerical feature columns, then load
# the tokenizer and models. Any failure is reported in the app and the script
# run is halted.
try:
    df_data = load_data()
    # Every column not listed in EXCLUDE_COLS is treated as a numerical
    # feature. Index.drop raises KeyError if a listed column is missing,
    # which the handler below reports.
    numerical_feature_cols = df_data.columns.drop(EXCLUDE_COLS).tolist()
    # Reassigned before the models are loaded, so the loader reads this value.
    NUMERICAL_FEATURE_DIM = len(numerical_feature_cols)
    tokenizer, cross_encoder_model, mixed_cross_encoder_model = load_models_and_tokenizer()
except Exception as e:
    st.error(f"Yüklenirken bir hata oluştu: {e}")
    st.stop()
53
+
54
def predict(model, tokenizer, str_a, str_b, numerical_features=None, *, max_length=None):
    """Score one string pair with ``model`` and return the result as a float.

    Args:
        model: Object exposing ``predict(inputs)``; it receives a dict with
            ``input_ids`` and ``attention_mask`` and, when provided,
            ``numerical_features``.
        tokenizer: Callable that tokenizes the pair and returns a mapping
            containing ``input_ids`` and ``attention_mask``.
        str_a: First string of the pair.
        str_b: Second string of the pair.
        numerical_features: Optional array-like (NumPy array, list, Series)
            of feature values; it is converted to a ``float32`` array of
            shape ``(1, n)``.
        max_length: Token length to pad/truncate to. Defaults to the
            module-level ``MAX_TOKEN_LEN``.

    Returns:
        ``prediction[0][0]`` converted to ``float``. The UI code treats this
        value as a distance and derives similarity as ``1 - score``.
    """
    if max_length is None:
        max_length = MAX_TOKEN_LEN

    tokenized = tokenizer(
        str_a, str_b,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )

    model_input = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
    }

    if numerical_features is not None:
        # np.asarray accepts lists and Series as well as arrays, so callers
        # are not required to pass an ndarray.
        model_input['numerical_features'] = np.asarray(
            numerical_features, dtype='float32'
        ).reshape(1, -1)

    prediction = model.predict(model_input)
    return float(prediction[0][0])
75
+
76
# --- Page layout and static content ---
# NOTE(review): Streamlit documents set_page_config as a call to make before
# other Streamlit commands; the startup code that runs earlier may already
# emit st.error. Confirm the deployed Streamlit version tolerates this order.
st.set_page_config(page_title="Varlık Benzerlik Testi", layout="centered")
st.title("İki Model Karşılaştırmalı Varlık Benzerlik Test Arayüzü")

st.info(
    "Bu uygulama, metinsel verileri kullanarak iki varlığın "
    "benzerlik olasılığını tahmin eder ve iki farklı modelin sonuçlarını karşılaştırır."
    "(henüz bi-encoder mimarisi eklenmemiştir, sadece cross-encoder modeli kullanılıyor)"
    "\n\n**Cross-encoder mimarisi:** yalnızca metin1, metin2 ve distance özellikleri ile eğitilmiştir."
    "\n\n**Mixed-cross-encoder mimarisi:** metin1, metin2, distance ve numerik özellikler ile eğitilmiştir."
)

st.image("src/encoder_algorithm.png", caption="Encoder Algoritma Akışı", use_container_width=True)

# --- Input selection: STRB choices are limited to rows matching the chosen STRA ---
st.header("Girdi String'leri")

stra_options = df_data['STRA'].unique()
str_a_input = st.selectbox("String A (STRA)", stra_options)

filtered_strb_options = df_data[df_data['STRA'] == str_a_input]['STRB'].unique()
str_b_input = st.selectbox("String B (STRB)", filtered_strb_options)

# --- Prediction and comparison ---
if st.button("Benzerliği Hesapla", type="primary"):
    if not str_a_input or not str_b_input:
        st.error("Lütfen her iki string alanını da seçin.")
    else:
        with st.spinner("Tahminler yapılıyor..."):
            selected_row = df_data[(df_data['STRA'] == str_a_input) & (df_data['STRB'] == str_b_input)]
            if selected_row.empty:
                st.error("Seçilen string'lere ait veri bulunamadı. Lütfen farklı seçimler yapın.")
                st.stop()

            # The first matching row supplies both the features and the label.
            numerical_features_for_prediction = selected_row[numerical_feature_cols].iloc[0].values

            cross_encoder_distance_score = predict(cross_encoder_model, tokenizer, str_a_input, str_b_input)
            cross_encoder_similarity_score = 1 - cross_encoder_distance_score

            mixed_cross_encoder_distance_score = predict(mixed_cross_encoder_model, tokenizer, str_a_input, str_b_input, numerical_features_for_prediction)
            mixed_cross_encoder_similarity_score = 1 - mixed_cross_encoder_distance_score

            # selected_row is known to be non-empty here, so the label is read
            # from it directly instead of repeating the same lookup.
            actual_distance = selected_row[LABEL_COL].iloc[0]
            actual_similarity = 1 - actual_distance

        st.subheader("Karşılaştırmalı Sonuçlar")

        # The label cell itself may still be NaN; show "N/A" in that case.
        actual_similarity_text = f"{actual_similarity:.4f}" if not np.isnan(actual_similarity) else "N/A"
        actual_distance_text = f"{actual_distance:.4f}" if not np.isnan(actual_distance) else "N/A"

        # A pair counts as similar when its similarity exceeds 0.5.
        cross_is_similar = cross_encoder_similarity_score > 0.5
        mixed_is_similar = mixed_cross_encoder_similarity_score > 0.5
        cross_verdict = "BENZER" if cross_is_similar else "BENZER DEĞİL"
        mixed_verdict = "BENZER" if mixed_is_similar else "BENZER DEĞİL"
        cross_color = 'blue' if cross_is_similar else 'red'
        mixed_color = 'blue' if mixed_is_similar else 'red'

        results_data = {
            "Özellik": ["Tahmin Edilen Benzerlik", "Gerçek Benzerlik", "Tahmin Edilen Mesafe", "Gerçek Mesafe", "Karar"],
            "Cross-Encoder Model": [
                f"{cross_encoder_similarity_score:.4f}",
                actual_similarity_text,
                f"{cross_encoder_distance_score:.4f}",
                actual_distance_text,
                cross_verdict,
            ],
            "Mixed Cross-Encoder Model": [
                f"{mixed_cross_encoder_similarity_score:.4f}",
                actual_similarity_text,
                f"{mixed_cross_encoder_distance_score:.4f}",
                actual_distance_text,
                mixed_verdict,
            ],
        }
        results_df = pd.DataFrame(results_data).set_index("Özellik")
        st.dataframe(results_df)

        st.markdown("---")
        st.markdown(f"**Cross-Encoder Model Kararı:** `{str_a_input}` ve `{str_b_input}` kelimeleri **:{cross_color}[{cross_verdict}]**.")
        st.markdown(f"**Mixed Cross-Encoder Model Kararı:** `{str_a_input}` ve `{str_b_input}` kelimeleri **:{mixed_color}[{mixed_verdict}]**.")