Rajor78 committed on
Commit
9e46923
verified
1 Parent(s): 8a5dbba

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +168 -0
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.corpus import stopwords
8
+ from nltk.stem import SnowballStemmer
9
+ from datetime import datetime
10
+ import re
11
+
12
# Download the NLTK resources used by this module.
# 'punkt' serves older NLTK releases; newer releases load the tokenizer data
# from 'punkt_tab' instead, so both are requested. nltk.download() reports an
# unknown resource and returns False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
15
+
16
class LiteralEncoder:
    """Assign codes to text literals by similarity to already-coded examples.

    Training literals are preprocessed, embedded with a sentence-transformer
    model and stored in ``literal_to_codes``; a new literal receives the codes
    of the most similar stored literal when the cosine similarity reaches a
    threshold.
    """

    def __init__(self):
        # Multilingual embedding model
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.stemmer = SnowballStemmer('spanish')
        self.stop_words = set(stopwords.words('spanish'))
        # Maps each training literal to {'codes', 'processed', 'embedding'}.
        self.literal_to_codes = {}
        # Not used by this class; kept so the attribute remains available.
        self.embeddings = {}

    @staticmethod
    def _resolve_column(df, name, position):
        """Return the label of column *name*, or of the column at *position*.

        The code addresses the columns as 'B' and 'C'. NOTE(review): these
        presumably refer to Excel column letters, while pandas labels columns
        by their header cell, so a positional fallback is used when no column
        carries that exact label.

        Raises:
            KeyError: if neither the label nor the position exists.
        """
        if name in df.columns:
            return name
        if position < len(df.columns):
            return df.columns[position]
        raise KeyError(
            f"Column '{name}' not found and the sheet has fewer than "
            f"{position + 1} columns"
        )

    @staticmethod
    def _split_codes(raw_codes):
        """Split a ';'-separated cell into stripped, non-empty codes.

        Missing cells (None/NaN) give an empty list instead of the code 'nan'.
        """
        if pd.isna(raw_codes):
            return []
        return [part.strip() for part in str(raw_codes).split(';') if part.strip()]

    def preprocess_literal(self, text):
        """Normalize a literal for comparison.

        Lower-cases and strips the text, replaces every character that is
        neither a word character nor whitespace with a space, tokenizes it,
        drops Spanish stop words and stems the remaining tokens.

        Returns:
            The stemmed tokens joined by single spaces.
        """
        text = str(text).lower().strip()
        text = re.sub(r'[^\w\s]', ' ', text)
        # Tokenize with the Spanish model, matching the stemmer and stop words.
        tokens = word_tokenize(text, language='spanish')
        return ' '.join(
            self.stemmer.stem(token)
            for token in tokens
            if token not in self.stop_words
        )

    def train(self, training_df):
        """Store the example literals, their codes and their embeddings.

        Args:
            training_df: DataFrame whose column 'B' (or second column) holds
                the literals and whose column 'C' (or third column) holds the
                ';'-separated codes.

        Rows with a missing/blank literal or without any code are skipped.
        A literal that appears more than once keeps the codes of its last row.

        Raises:
            KeyError: if the literal or code column cannot be located.
        """
        literal_col = self._resolve_column(training_df, 'B', 1)
        codes_col = self._resolve_column(training_df, 'C', 2)

        for raw_literal, raw_codes in zip(training_df[literal_col],
                                          training_df[codes_col]):
            if pd.isna(raw_literal):
                continue
            literal = str(raw_literal).strip()
            codes = self._split_codes(raw_codes)
            if not literal or not codes:
                continue
            self.literal_to_codes[literal] = {
                'codes': codes,
                'processed': self.preprocess_literal(literal),
            }

        if not self.literal_to_codes:
            # Nothing to embed; avoid calling the model with an empty batch.
            return

        # Embed all stored literals in one batch.
        literals = list(self.literal_to_codes)
        processed = [self.literal_to_codes[lit]['processed'] for lit in literals]
        for lit, embedding in zip(literals, self.model.encode(processed)):
            self.literal_to_codes[lit]['embedding'] = embedding

    def encode_literal(self, literal, threshold=0.7):
        """Find the codes of the stored literal most similar to *literal*.

        Args:
            literal: Text to code.
            threshold: Minimum cosine similarity required to accept a match.

        Returns:
            A dict with keys 'codes', 'similarity' and 'matched_literal'.
            When nothing is stored, or the best similarity is not positive or
            is below *threshold*, the result is
            ``{'codes': [], 'similarity': 0, 'matched_literal': 'NO_MATCH'}``.
        """
        no_match = {
            'codes': [],
            'similarity': 0,
            'matched_literal': 'NO_MATCH',
        }

        candidates = [
            (train_literal, data)
            for train_literal, data in self.literal_to_codes.items()
            if 'embedding' in data
        ]
        if not candidates:
            return no_match

        processed = self.preprocess_literal(literal)
        literal_embedding = self.model.encode([processed])[0]

        # One similarity call against every stored embedding instead of one
        # call per training literal.
        similarities = cosine_similarity(
            [literal_embedding],
            [data['embedding'] for _, data in candidates],
        )[0]
        best_index = int(similarities.argmax())  # first maximum wins ties
        best_similarity = similarities[best_index]

        # As before, only a strictly positive similarity can be a match.
        if best_similarity <= 0 or best_similarity < threshold:
            return no_match

        best_match, best_data = candidates[best_index]
        return {
            'codes': best_data['codes'],
            'similarity': best_similarity,
            'matched_literal': best_match,
        }
87
+
88
def process_excel(training_file, new_file, confidence_threshold=0.7):
    """Code the literals of an Excel file using a coded example file.

    Args:
        training_file: Uploaded example workbook (object with a ``name``
            path attribute, or a plain path).
        new_file: Uploaded workbook with the literals to code (same forms).
        confidence_threshold: Minimum similarity passed to
            ``LiteralEncoder.encode_literal``.

    Returns:
        The name of the generated ``.xlsx`` file, which contains the sheets
        'Resultados', 'Resumen' and 'Datos_Training'.

    Raises:
        gr.Error: if anything fails. The output component is a file, so a
            returned error string would be treated as a file path.
    """
    try:
        if training_file is None or new_file is None:
            raise ValueError("Debe subir ambos archivos Excel")

        # Read both workbooks; accept upload objects as well as plain paths.
        training_df = pd.read_excel(getattr(training_file, 'name', training_file))
        new_df = pd.read_excel(getattr(new_file, 'name', new_file))

        # Initialize and train the encoder
        encoder = LiteralEncoder()
        encoder.train(training_df)

        # Locate the literal column. NOTE(review): 'B' presumably means the
        # Excel column letter; fall back to the second column when no header
        # is literally 'B'.
        if 'B' in new_df.columns:
            literal_col = 'B'
        elif len(new_df.columns) > 1:
            literal_col = new_df.columns[1]
        else:
            raise KeyError("Column 'B' not found and the sheet has fewer than 2 columns")

        # Code every new literal
        matches = [
            encoder.encode_literal(str(value), confidence_threshold)
            for value in new_df[literal_col]
        ]

        # Build the results sheet
        results_df = new_df.copy()
        results_df['Códigos_Asignados'] = [
            '; '.join(match['codes']) if match['codes'] else 'SIN_MATCH'
            for match in matches
        ]
        results_df['Literal_Original'] = [match['matched_literal'] for match in matches]
        results_df['Score_Similitud'] = [
            round(float(match['similarity']), 3) for match in matches
        ]

        # Summary statistics
        total = len(results_df)
        matched = int((results_df['Códigos_Asignados'] != 'SIN_MATCH').sum())
        # Guard against an empty sheet.
        success_pct = (matched / total * 100) if total else 0.0

        stats_df = pd.DataFrame({
            'Métrica': [
                'Total Literales',
                'Literales Codificados',
                'Sin Coincidencia',
                'Porcentaje Éxito',
            ],
            'Valor': [
                total,
                matched,
                total - matched,
                f"{success_pct:.1f}%",
            ],
        })

        # Save the results
        output_name = f"codificacion_literales_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

        with pd.ExcelWriter(output_name) as writer:
            results_df.to_excel(writer, sheet_name='Resultados', index=False)
            stats_df.to_excel(writer, sheet_name='Resumen', index=False)
            training_df.to_excel(writer, sheet_name='Datos_Training', index=False)

        return output_name

    except Exception as e:
        # Top-level UI boundary: surface the failure in the interface.
        raise gr.Error(f"Error: {str(e)}") from e
148
+
149
# Gradio interface: two Excel uploads and a threshold slider in, one Excel
# file out. The user-facing strings below had mis-encoded accented
# characters, which are restored here.
iface = gr.Interface(
    fn=process_excel,
    inputs=[
        gr.File(label="Excel con literales de entrenamiento (B: literales, C: códigos)"),
        gr.File(label="Excel con nuevos literales a codificar"),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.7,
            label="Umbral de confianza (0-1)"
        )
    ],
    outputs=gr.File(label="Excel con resultados"),
    title="Codificador Automático de Literales",
    description="Codifica automáticamente literales basándose en ejemplos previos. Los códigos múltiples deben estar separados por punto y coma (;) en la columna C."
)

if __name__ == "__main__":
    iface.launch()