Update modules/semantic_analysis.py
modules/semantic_analysis.py (+27 -56)
@@ -112,70 +112,41 @@ ENTITY_LABELS = {
def count_pos(doc):
    return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')

-import networkx as nx
-import matplotlib.pyplot as plt
-from collections import Counter
-
-# Keep the POS_COLORS and POS_TRANSLATIONS definitions you already have
-
-        if ent.label_ == "PERSON":
-            entities[list(ENTITY_LABELS[lang].keys())[0]].append(ent.text)
-        elif ent.label_ in ["LOC", "GPE"]:
-            entities[list(ENTITY_LABELS[lang].keys())[2]].append(ent.text)
-        elif ent.label_ == "DATE":
-            entities[list(ENTITY_LABELS[lang].keys())[3]].append(ent.text)
-        else:
-            entities[list(ENTITY_LABELS[lang].keys())[1]].append(ent.text)
-
-    return entities
-
-# entities = extract_entities(doc, lang)
-# color_map = ENTITY_LABELS[lang]
+#####################################################################################################################
+def create_semantic_graph(doc, lang):
+    G = nx.Graph()
+    word_freq = defaultdict(int)
+    lemma_to_word = {}
+    lemma_to_pos = {}
+
+    # Count frequencies of lemmas and map lemmas to their most common word form and POS
+    for token in doc:
+        if token.pos_ in ['NOUN', 'VERB']:
+            lemma = token.lemma_.lower()
+            word_freq[lemma] += 1
+            if lemma not in lemma_to_word or token.text.lower() == lemma:
+                lemma_to_word[lemma] = token.text
+                lemma_to_pos[lemma] = token.pos_
+
+    # Get top 20 most frequent lemmas
+    top_lemmas = [lemma for lemma, _ in sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]]

# Add nodes
+    for lemma in top_lemmas:
+        word = lemma_to_word[lemma]
+        G.add_node(word, pos=lemma_to_pos[lemma])

# Add edges
-# plt.figure(figsize=(30, 22))  # Increased figure size
-# pos = nx.spring_layout(G, k=0.7, iterations=50)  # Adjusted layout
-# node_colors = [color_map[G.nodes[node]['category']] for node in G.nodes()]
-# nx.draw(G, pos, node_color=node_colors, with_labels=True,
-#         node_size=10000,  # Increased node size
-#         font_size=18,  # Increased font size
-#         font_weight='bold',
-#         width=2,  # Increased edge width
-#         arrowsize=30)  # Increased arrow size
-
-# Add a legend
-# legend_elements = [plt.Rectangle((0, 0), 1, 1, fc=color, edgecolor='none', label=category)
-#                    for category, color in color_map.items()]
-# plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1, 1), fontsize=16)  # Increased legend font size
-
-# plt.title("Análisis del Contexto" if lang == 'es' else "Context Analysis" if lang == 'en' else "Analyse du Contexte", fontsize=24)  # Increased title font size
-# plt.axis('off')
+    for token in doc:
+        if token.lemma_.lower() in top_lemmas:
+            if token.head.lemma_.lower() in top_lemmas:
+                source = lemma_to_word[token.lemma_.lower()]
+                target = lemma_to_word[token.head.lemma_.lower()]
+                if source != target:  # Avoid self-loops
+                    G.add_edge(source, target, label=token.dep_)
+
+    return G, word_freq

############################################################################################################################################
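The new create_semantic_graph references nx and defaultdict, yet this hunk deletes the old mid-file imports. A minimal sketch of the module-level imports the function presumably relies on; their placement at the top of modules/semantic_analysis.py is an assumption, since the import block is outside this hunk:

# Assumed module-level imports (not shown in this hunk).
import networkx as nx                           # nx.Graph() in create_semantic_graph
from collections import Counter, defaultdict    # Counter for count_pos, defaultdict for word_freq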
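For quick verification of the change, a usage sketch assuming a spaCy pipeline with a dependency parser; the model name es_core_news_sm, the sample sentence, and the drawing parameters are illustrative assumptions, not part of this commit:

import spacy
import networkx as nx
import matplotlib.pyplot as plt

nlp = spacy.load("es_core_news_sm")  # assumed model; any spaCy pipeline with a parser works
doc = nlp("El análisis semántico construye un grafo con los lemas más frecuentes del texto.")

G, word_freq = create_semantic_graph(doc, lang='es')

# Nodes are surface forms while frequencies are keyed by lemma, so fall back to 1 on a miss.
sizes = [800 + 800 * word_freq.get(node.lower(), 1) for node in G.nodes()]
colors = ['skyblue' if G.nodes[node]['pos'] == 'NOUN' else 'lightgreen' for node in G.nodes()]

pos = nx.spring_layout(G, k=0.7)
nx.draw(G, pos, node_size=sizes, node_color=colors, with_labels=True, font_weight='bold')
plt.axis('off')
plt.show()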