# Hugging Face Space: stylometric authorship-attribution demo (Gradio app).
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.svm import LinearSVC | |
| from sklearn.model_selection import cross_val_score | |
| from sklearn.metrics.pairwise import cosine_distances | |
| from pypdf import PdfReader | |
| import docx | |
| import re | |
| import os | |
def extract_text(filepath):
    """Extract plain text from a .txt, .pdf, or .docx file.

    Parameters
    ----------
    filepath : str
        Path to the document on disk.

    Returns
    -------
    str
        The extracted text, or "" for unsupported extensions or on any
        extraction error (deliberate best-effort behaviour: the caller
        reports "insufficient text" rather than crashing).
    """
    ext = os.path.splitext(filepath)[1].lower()
    text = ""
    try:
        if ext == '.txt':
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
        elif ext == '.pdf':
            reader = PdfReader(filepath)
            # Call extract_text() exactly once per page: extraction is
            # expensive, and the original invoked it twice (once in the
            # filter, once in the join).
            pages = (page.extract_text() for page in reader.pages)
            text = " ".join(p for p in pages if p)
        elif ext == '.docx':
            doc = docx.Document(filepath)
            text = " ".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception:
        # Best-effort: corrupt/unreadable files yield an empty string.
        pass
    return text
def chunk_text(text, chunk_size=1000):
    """Normalize *text* and split it into runs of *chunk_size* words.

    The text is lower-cased and stripped of every character that is not
    a word character, whitespace, or basic punctuation. Words are then
    grouped in runs of *chunk_size*; a trailing run of half a chunk or
    fewer words is discarded. If no run survives, the whole word list
    is returned as a single chunk.
    """
    cleaned = re.sub(r'[^\w\s\.,;:\?!]', '', text.lower())
    words = cleaned.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        piece = words[start:start + chunk_size]
        if len(piece) > chunk_size * 0.5:
            chunks.append(" ".join(piece))
    if not chunks:
        return [" ".join(words)]
    return chunks
def unmasking_algorithm(chunks_a, chunks_b, iterations=10, drop_features=3):
    """Koppel-Schler unmasking: iteratively remove the strongest features.

    Trains a linear SVM to separate chunks of document A from document B,
    records the cross-validated accuracy, zeroes the `drop_features` most
    discriminative features, and repeats. A steep accuracy drop suggests
    the initial differences were superficial (same author).

    Parameters
    ----------
    chunks_a, chunks_b : list[str]
        Text chunks from each document.
    iterations : int
        Number of feature-elimination rounds.
    drop_features : int
        Number of features zeroed out per round.

    Returns
    -------
    list[float]
        Mean cross-validation accuracy per iteration, or
        [0] * iterations when there are too few chunks for 2-fold CV.
    """
    labels = [0] * len(chunks_a) + [1] * len(chunks_b)
    all_chunks = chunks_a + chunks_b
    # Hoisted out of the loop: the fold count is loop-invariant, so the
    # original recomputed (and conditionally returned from) this check
    # on every iteration.
    cv_folds = min(3, len(all_chunks))
    if cv_folds < 2:
        return [0] * iterations
    vectorizer = TfidfVectorizer(max_features=500, use_idf=False)
    X = vectorizer.fit_transform(all_chunks).toarray()
    y = np.array(labels)
    accuracies = []
    clf = LinearSVC(dual="auto", max_iter=2000)
    for _ in range(iterations):
        # Cross-validation (3-fold for medium-sized texts)
        scores = cross_val_score(clf, X, y, cv=cv_folds)
        accuracies.append(scores.mean())
        # Fit on the full set to read the per-feature weights
        clf.fit(X, y)
        weights = np.abs(clf.coef_[0])
        # Zero out the most discriminative features in one assignment
        top_indices = weights.argsort()[-drop_features:]
        X[:, top_indices] = 0
    return accuracies
def bootstrap_distance(features_a, features_b, n_iterations=1000, random_state=None):
    """Bootstrap the cosine distance between two feature vectors.

    Resamples the feature dimensions with replacement `n_iterations`
    times and computes the cosine distance of each resampled pair,
    yielding a mean distance with a 95% percentile confidence interval.

    Parameters
    ----------
    features_a, features_b : np.ndarray
        1-D feature vectors of equal length.
    n_iterations : int
        Number of bootstrap resamples.
    random_state : int | None
        Optional seed for reproducible resampling. Default None keeps
        the original unseeded behaviour (backward compatible).

    Returns
    -------
    tuple
        (mean distance, 2.5th percentile, 97.5th percentile), or
        (0, 0, 0) when every resample produced a null vector.
    """
    rng = np.random.default_rng(random_state)
    distances = []
    n_features = features_a.shape[0]
    for _ in range(n_iterations):
        indices = rng.choice(n_features, n_features, replace=True)
        sample_a = features_a[indices]
        sample_b = features_b[indices]
        norm_a = np.linalg.norm(sample_a)
        norm_b = np.linalg.norm(sample_b)
        # Skip resamples that produced a null vector (cosine undefined).
        # FIX: the original tested sum == 0, which only coincides with a
        # null vector for non-negative features; the norm test is exact.
        if norm_a == 0 or norm_b == 0:
            continue
        # Cosine distance computed directly; the original reshaped into
        # 1xN matrices just to call sklearn's pairwise cosine_distances.
        dist = 1.0 - float(np.dot(sample_a, sample_b)) / (norm_a * norm_b)
        distances.append(dist)
    if not distances:
        return 0, 0, 0
    ci_lower = np.percentile(distances, 2.5)
    ci_upper = np.percentile(distances, 97.5)
    mean_dist = np.mean(distances)
    return mean_dist, ci_lower, ci_upper
def process_forensic_analysis(files):
    """Run the two-document authorship comparison pipeline.

    Extracts and chunks both documents, applies the unmasking algorithm
    and a bootstrapped cosine-delta distance, and builds a Markdown
    verdict plus a two-panel Plotly figure.

    Parameters
    ----------
    files : list
        Exactly two uploads from gr.File; each item is either a filepath
        string (type="filepath") or a file-like object exposing .name.

    Returns
    -------
    tuple
        (Markdown report str, plotly Figure or None on validation error).
    """
    if not files or len(files) != 2:
        return "El análisis profundo requiere exactamente dos documentos para la matriz comparativa.", None
    # BUG FIX: gr.File(type="filepath") yields plain path strings, which
    # have no .name attribute; accept both strings and file-like objects.
    path_a = files[0] if isinstance(files[0], str) else files[0].name
    path_b = files[1] if isinstance(files[1], str) else files[1].name
    doc_a_name = os.path.basename(path_a)
    doc_b_name = os.path.basename(path_b)
    text_a = extract_text(path_a)
    text_b = extract_text(path_b)
    chunks_a = chunk_text(text_a)
    chunks_b = chunk_text(text_b)
    if len(chunks_a) < 2 or len(chunks_b) < 2:
        return "El volumen textual es insuficiente. Se requieren textos más extensos para garantizar la validez del remuestreo y la validación cruzada.", None
    # 1. Unmasking analysis
    unmasking_curve = unmasking_algorithm(chunks_a, chunks_b)
    # A substantial accuracy drop after feature removal implies same author
    degradation = unmasking_curve[0] - unmasking_curve[-1]
    is_same_author_unmasking = degradation > 0.15 and unmasking_curve[-1] < 0.65
    # 2. Bootstrapping the cosine delta (character n-gram frequencies)
    vectorizer = TfidfVectorizer(max_features=250, analyzer='char', ngram_range=(3, 4))
    all_text_a = " ".join(chunks_a)
    all_text_b = " ".join(chunks_b)
    tfidf_matrix = vectorizer.fit_transform([all_text_a, all_text_b]).toarray()
    mean_dist, ci_lower, ci_upper = bootstrap_distance(tfidf_matrix[0], tfidf_matrix[1])
    # 3. Building the verdict (user-facing strings intentionally in Spanish)
    report = f"### Evaluación Pericial: {doc_a_name} vs {doc_b_name}\n\n"
    if is_same_author_unmasking and mean_dist < 0.4:
        report += "El análisis determina que los documentos pertenecen al **mismo autor**.\n\n"
        report += "La curva de desenmascaramiento demuestra que las diferencias iniciales entre los textos eran puramente superficiales o temáticas. Al eliminar los marcadores léxicos más evidentes, el clasificador pierde su capacidad de distinguir los fragmentos, confirmando una arquitectura mental subyacente idéntica."
    else:
        report += "El análisis determina que los documentos pertenecen a **diferentes autores**.\n\n"
        report += "El algoritmo de desenmascaramiento mantiene una alta precisión predictiva incluso tras suprimir múltiples capas de características discriminativas. Esto indica que la divergencia estilística es profunda, estructural y está anclada en hábitos sintácticos diametralmente opuestos."
    report += f"\n\n#### Datos estadísticos\n"
    report += f"La distancia de la delta del coseno (tras 1000 iteraciones de *bootstrapping*) se sitúa en **{mean_dist:.3f}**. "
    report += f"Existe un 95 % de certeza estadística de que la divergencia real fluctúa en el intervalo de **[{ci_lower:.3f}, {ci_upper:.3f}]**."
    # 4. Dual visualization
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Curva de Desenmascaramiento", "Distribución de la Distancia (Bootstrapping)"))
    # Panel 1: unmasking curve
    fig.add_trace(go.Scatter(y=unmasking_curve, mode='lines+markers', name='Precisión SVM', line=dict(color='firebrick', width=2)), row=1, col=1)
    fig.update_xaxes(title_text="Iteraciones (Eliminación de características)", row=1, col=1)
    fig.update_yaxes(title_text="Precisión de Validación Cruzada", range=[0.3, 1.05], row=1, col=1)
    # Panel 2: approximate bell curve of the bootstrap distribution
    x_val = np.linspace(max(0, mean_dist - 0.2), mean_dist + 0.2, 100)
    # BUG FIX: a zero-width confidence interval made the original divide
    # by zero (NaN curve); clamp the Gaussian sigma to a tiny positive value.
    sigma = max((ci_upper - ci_lower) / 4, 1e-6)
    y_val = np.exp(-0.5 * ((x_val - mean_dist) / sigma)**2)
    fig.add_trace(go.Scatter(x=x_val, y=y_val, fill='tozeroy', name='Probabilidad', line=dict(color='royalblue')), row=1, col=2)
    # Mark the confidence bounds
    fig.add_vline(x=ci_lower, line_dash="dash", line_color="black", row=1, col=2)
    fig.add_vline(x=ci_upper, line_dash="dash", line_color="black", row=1, col=2)
    fig.update_xaxes(title_text="Distancia del Coseno", row=1, col=2)
    fig.update_layout(template="plotly_white", showlegend=False, height=450)
    return report, fig
# --- Gradio user interface ----------------------------------------------
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown("# Sistema de Identificación Estilométrica")
    gr.Markdown("Análisis comparativo mediante algoritmo de desenmascaramiento transversal e intervalos de confianza generados por *bootstrapping* estadístico.")
    with gr.Row():
        # Upload widget and trigger button, side by side.
        file_input = gr.File(
            label="Sube exactamente DOS documentos (.txt, .pdf, .docx)",
            file_count="multiple",
            type="filepath",
        )
        analyze_btn = gr.Button("Generar dictamen", variant="primary")
    # Output areas: the Markdown verdict and the dual Plotly figure.
    output_report = gr.Markdown()
    output_plot = gr.Plot()
    analyze_btn.click(
        process_forensic_analysis,
        inputs=file_input,
        outputs=[output_report, output_plot],
    )

if __name__ == "__main__":
    demo.launch()