Spaces:
Sleeping
Sleeping
Update modules/extractive.py
Browse files- modules/extractive.py +148 -19
modules/extractive.py
CHANGED
|
@@ -1,47 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
|
|
|
|
|
|
| 2 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 3 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 4 |
from summarizer import Summarizer
|
| 5 |
-
import networkx as nx
|
| 6 |
|
| 7 |
|
| 8 |
class TFIDFSummarizer:
|
|
|
|
|
|
|
| 9 |
@staticmethod
|
| 10 |
-
def summarize(sentences, preprocessed_sentences, num_sentences):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
vectorizer = TfidfVectorizer()
|
| 12 |
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
|
| 13 |
-
|
| 14 |
-
ranked_indices = np.argsort(
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class TextRankSummarizer:
|
|
|
|
|
|
|
| 19 |
@staticmethod
|
| 20 |
-
def summarize(sentences, preprocessed_sentences, num_sentences):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
vectorizer = TfidfVectorizer()
|
| 22 |
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
|
| 23 |
similarity_matrix = cosine_similarity(tfidf_matrix)
|
| 24 |
nx_graph = nx.from_numpy_array(similarity_matrix)
|
| 25 |
scores = nx.pagerank(nx_graph)
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
class CombinedSummarizer:
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
)
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
class BERTSummarizer:
|
|
|
|
|
|
|
| 43 |
def __init__(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
self.model = Summarizer()
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# extractive.py
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Módulo de resúmenes extractivos 'extractive.py'
|
| 5 |
+
|
| 6 |
+
Contiene implementaciones de diferentes técnicas de resumen de texto:
|
| 7 |
+
- TF-IDF Summarizer
|
| 8 |
+
- TextRank Summarizer
|
| 9 |
+
- Combined Summarizer (que combina TF-IDF y TextRank para extraer palabras clave)
|
| 10 |
+
- BERT Summarizer (extractivo basado en un modelo BERT preentrenado)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
import numpy as np
|
| 14 |
+
import networkx as nx
|
| 15 |
+
from typing import List
|
| 16 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 17 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 18 |
from summarizer import Summarizer
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class TFIDFSummarizer:
    """Extractive summarizer that ranks sentences by their summed TF-IDF weight."""

    @staticmethod
    def summarize(sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Build a summary from the sentences with the highest total TF-IDF weight.

        :param sentences: Original (unprocessed) sentences; the summary is assembled from these.
        :param preprocessed_sentences: Preprocessed counterparts of ``sentences``, index-aligned
            with them; these are the texts that get vectorized.
        :param num_sentences: Maximum number of sentences to include in the summary.
        :return: The selected original sentences joined by single spaces, best-scored first.
            An empty string when there is nothing to rank or ``num_sentences`` is not positive.
        """
        # Fitting a vectorizer on no documents raises, and a negative count would
        # slice from the wrong end of the ranking, so both cases short-circuit.
        if not preprocessed_sentences or num_sentences <= 0:
            return ''

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

        # Sum each row directly on the sparse matrix instead of materializing a
        # dense (sentences x vocabulary) array first.
        sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

        # Stable sort on the negated scores: descending order, and sentences with
        # equal scores keep their original relative order.
        ranked_indices = np.argsort(-sentence_scores, kind='stable')
        selected = [sentences[i] for i in ranked_indices[:num_sentences]]
        return ' '.join(selected)
|
| 40 |
|
| 41 |
|
| 42 |
class TextRankSummarizer:
    """Extractive summarizer that ranks sentences with PageRank over a similarity graph."""

    @staticmethod
    def summarize(sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Build a summary using the graph-based TextRank approach.

        Sentences are vectorized with TF-IDF, their pairwise cosine similarities
        become the edge weights of a graph, and PageRank scores the nodes.

        :param sentences: Original sentences; the summary is assembled from these.
        :param preprocessed_sentences: Preprocessed counterparts of ``sentences``, index-aligned
            with them; these are the texts that get vectorized.
        :param num_sentences: Maximum number of sentences to include in the summary.
        :return: The selected original sentences joined by single spaces, best-scored first.
            An empty string when there is nothing to rank or ``num_sentences`` is not positive.
        """
        # Fitting a vectorizer on no documents raises, and a negative count would
        # slice from the wrong end of the ranking, so both cases short-circuit.
        if not preprocessed_sentences or num_sentences <= 0:
            return ''

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
        similarity_matrix = cosine_similarity(tfidf_matrix)
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)

        # Graph nodes are the sentence indices. Sorting by score only (stable sort)
        # keeps earlier sentences ahead of later ones when their scores tie.
        ranked_nodes = sorted(scores, key=scores.get, reverse=True)
        selected = [sentences[i] for i in ranked_nodes[:num_sentences]]
        return ' '.join(selected)
|
| 64 |
|
| 65 |
|
| 66 |
class CombinedSummarizer:
    """Extractive summarizer that scores sentences by keywords shared between TF-IDF and TextRank."""

    def __init__(self, top_n_keywords: int = 10):
        """
        :param top_n_keywords: Number of keywords to take from each method (TF-IDF and TextRank).
        """
        self.top_n_keywords = top_n_keywords

    def extract_keywords_tfidf(self, preprocessed_sentences: List[str]) -> List[str]:
        """
        Extract the terms with the highest TF-IDF weight summed over all sentences.

        :param preprocessed_sentences: Preprocessed sentences.
        :return: Up to ``top_n_keywords`` terms, highest weight first. Empty list for empty input.
        """
        # Fitting a vectorizer on no documents raises; there are no keywords to report.
        if not preprocessed_sentences:
            return []

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
        # Column sums taken on the sparse matrix; no dense copy is needed.
        term_weights = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
        tfidf_scores = zip(vectorizer.get_feature_names_out(), term_weights)
        sorted_scores = sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)
        return [word for word, _ in sorted_scores[:self.top_n_keywords]]

    def extract_keywords_textrank(self, preprocessed_sentences: List[str]) -> List[str]:
        """
        Extract keywords by running PageRank on a word co-occurrence graph.

        All sentences are joined into one whitespace-separated token stream, and each
        pair of adjacent tokens adds one unit of weight to the edge between them.

        :param preprocessed_sentences: Preprocessed sentences.
        :return: Up to ``top_n_keywords`` words, highest rank first. Empty list when
            fewer than two tokens are available (no edge can be formed).
        """
        words = ' '.join(preprocessed_sentences).split()
        if len(words) < 2:
            return []

        co_occurrence_graph = nx.Graph()
        for left, right in zip(words, words[1:]):
            if co_occurrence_graph.has_edge(left, right):
                co_occurrence_graph[left][right]['weight'] += 1
            else:
                co_occurrence_graph.add_edge(left, right, weight=1)

        ranks = nx.pagerank(co_occurrence_graph, weight='weight')
        sorted_ranks = sorted(ranks.items(), key=lambda pair: pair[1], reverse=True)
        return [word for word, _ in sorted_ranks[:self.top_n_keywords]]

    def combined_keywords(self, preprocessed_sentences: List[str]) -> List[str]:
        """
        Return the keywords found by both TF-IDF and TextRank.

        :param preprocessed_sentences: Preprocessed sentences.
        :return: The intersection of both keyword lists, in TF-IDF rank order so the
            result is the same on every run.
        """
        tfidf_keywords = self.extract_keywords_tfidf(preprocessed_sentences)
        textrank_keywords = set(self.extract_keywords_textrank(preprocessed_sentences))
        return [word for word in tfidf_keywords if word in textrank_keywords]

    def summarize(self, sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Build a summary from the sentences containing the most combined keywords.

        :param sentences: Original sentences; the summary is assembled from these.
        :param preprocessed_sentences: Preprocessed counterparts of ``sentences``, index-aligned
            with them; keyword hits are counted on their whitespace-separated tokens.
        :param num_sentences: Maximum number of sentences to include in the summary.
        :return: The selected original sentences joined by single spaces, most keyword
            hits first. An empty string when there is nothing to rank or
            ``num_sentences`` is not positive.
        """
        if not preprocessed_sentences or num_sentences <= 0:
            return ''

        # A set makes each membership test constant-time inside the scoring loop.
        keywords = set(self.combined_keywords(preprocessed_sentences))
        sentence_scores = [
            (sum(1 for word in sentence.split() if word in keywords), i)
            for i, sentence in enumerate(preprocessed_sentences)
        ]
        # Sorting on the hit count alone (stable sort) keeps sentences with equal
        # counts in their original order.
        ranked_sentences = sorted(sentence_scores, key=lambda pair: pair[0], reverse=True)
        selected = [sentences[i] for _, i in ranked_sentences[:num_sentences]]
        return ' '.join(selected)
|
| 138 |
|
| 139 |
|
| 140 |
class BERTSummarizer:
    """Extractive summarizer backed by a pretrained BERT model (``Summarizer``)."""

    # Cache for the model so it is constructed once and then reused by every
    # instance and every call to the static ``summarize``.
    # NOTE(review): assumes a Summarizer instance can be reused across calls - confirm.
    _shared_model = None

    def __init__(self):
        """
        Initialize the extractive BERT model through the ``Summarizer`` class.

        Requires the 'bert-extractive-summarizer' package to be installed and importable.
        """
        self.model = BERTSummarizer._get_model()

    @staticmethod
    def _get_model():
        """Return the cached ``Summarizer``, constructing it on first use."""
        if BERTSummarizer._shared_model is None:
            BERTSummarizer._shared_model = Summarizer()
        return BERTSummarizer._shared_model

    @staticmethod
    def summarize(sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """
        Build an extractive summary with the BERT model and map it back to the original sentences.

        :param sentences: Original (unprocessed) sentences; the summary is assembled from these.
        :param preprocessed_sentences: Preprocessed counterparts of ``sentences``, index-aligned
            with them; their concatenation is the text handed to the model.
        :param num_sentences: Maximum number of sentences to include in the summary.
        :return: The matched original sentences joined by single spaces, in the order the
            model's output mentions them. An empty string when there is no input,
            ``num_sentences`` is not positive, or no fragment could be matched.
        """
        if not sentences or not preprocessed_sentences or num_sentences <= 0:
            return ''

        # The model summarizes the preprocessed sentences joined into one text.
        text = ' '.join(preprocessed_sentences)
        summarized_text = BERTSummarizer._get_model()(text, num_sentences=num_sentences)

        selected = []
        used_indices = set()
        for fragment in summarized_text.split('. '):
            fragment = fragment.strip()
            # An empty fragment is a substring of every sentence and would
            # wrongly select the first one.
            if not fragment:
                continue
            for idx, sentence in enumerate(sentences):
                if idx in used_indices:
                    continue
                # The fragment comes from the preprocessed text, so check that form
                # as well as the original sentence.
                cleaned = preprocessed_sentences[idx] if idx < len(preprocessed_sentences) else ''
                if fragment in sentence or fragment in cleaned:
                    selected.append(sentence)
                    used_indices.add(idx)
                    break
            if len(selected) >= num_sentences:
                break
        return ' '.join(selected[:num_sentences])
|
| 176 |
+
|