# simple-text-analyzer / text_analyzer / collocation_extractor.py
# (HuggingFace upload by egumasa -- commit "construction extractor", 5ac114d)
# Load packages
import spacy
import glob
import pandas as pd
# Initialize spaCy model
# NOTE(review): this module-level pipeline is never referenced in this file --
# CollocationExtractor loads its own copy in __init__, so the model is loaded
# twice. Consider removing one load; first confirm no other module does
# `from collocation_extractor import nlp`.
nlp = spacy.load("en_core_web_sm")
# Define dependency relations for collocation extraction.
# Each triple is (dep_label, pos1, pos2). For amod/advmod/compound the
# extractor compares pos1 against the DEPENDENT token and pos2 against its
# head; for all other relations pos1 is the head and pos2 the dependent.
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),       # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),      # direct object (e.g., "eat food")
    # BUGFIX: the dependent of an advmod arc is the adverb and its head is
    # the verb; the original ("advmod", "VERB", "ADV") order could never
    # match, silently dropping all adverb collocations.
    ("advmod", "ADV", "VERB"),     # adverb modifier (e.g., "run quickly")
    # NOTE(review): en_core_web_sm analyzes "cup of tea" as prep/pobj, not
    # nmod -- this pattern may rarely fire; verify against your corpus.
    ("nmod", "NOUN", "NOUN"),      # noun modifier
    ("compound", "NOUN", "NOUN"),  # compound nouns (e.g., "computer science")
]
# Alternative approach: Class-based design (recommended)
class CollocationExtractor:
    """Extract dependency-based collocations from a corpus of text files.

    Accumulates whitespace-token counts (corpus size, unigram frequencies)
    and dependency-pattern bigram frequencies into ``self.results``:

    - ``corpus_size``: int, running whitespace-token count over all files
    - ``unigram``: ``{lowercased_word: freq}``
    - ``bigram``: ``{"w1_w2_dep": {"word1", "word2", "dep_rel", "freq"}}``
    """

    def __init__(self, patterns=None):
        """Create an extractor.

        Args:
            patterns: iterable of (dep_label, pos1, pos2) triples; defaults
                to the module-level COLLOCATION_PATTERNS. BUGFIX: the
                original used the mutable module-level list directly as the
                default argument value, sharing one list object across all
                instances; use a None sentinel instead.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.patterns = COLLOCATION_PATTERNS if patterns is None else patterns
        self.results = {
            "corpus_size": 0,
            "unigram": {},
            "bigram": {},
        }

    def process_file(self, filepath):
        """Read, parse, and fold a single file into the internal results."""
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        doc = self.nlp(text)
        collocations = self._extract_collocations(doc)
        self._update_results(text, collocations)

    def _extract_collocations(self, doc):
        """Return (word1, word2, dep_rel) triples for matching dependency arcs.

        For amod/advmod/compound the dependent token precedes its head in
        the output pair; for every other relation the head comes first.
        """
        collocations = []
        for token in doc:
            for dep_rel, pos1, pos2 in self.patterns:
                if token.dep_ != dep_rel:
                    continue
                if dep_rel in ("amod", "advmod", "compound"):
                    # pos1 = dependent's POS, pos2 = head's POS
                    if token.pos_ == pos1 and token.head.pos_ == pos2:
                        collocations.append(
                            (token.text.lower(), token.head.text.lower(), dep_rel))
                else:
                    # pos1 = head's POS, pos2 = dependent's POS
                    if token.head.pos_ == pos1 and token.pos_ == pos2:
                        collocations.append(
                            (token.head.text.lower(), token.text.lower(), dep_rel))
        return collocations

    def _update_results(self, text, collocations):
        """Fold one document's token and collocation counts into self.results."""
        # NOTE(review): corpus_size and unigrams count *whitespace* tokens,
        # so they can disagree with spaCy's tokenization used for bigrams.
        tokens = text.split()
        self.results["corpus_size"] += len(tokens)

        unigram = self.results["unigram"]
        for token in tokens:
            key = token.lower()
            unigram[key] = unigram.get(key, 0) + 1

        bigram = self.results["bigram"]
        for word1, word2, dep_rel in collocations:
            entry = bigram.setdefault(
                f"{word1}_{word2}_{dep_rel}",
                {"word1": word1, "word2": word2, "dep_rel": dep_rel, "freq": 0},
            )
            entry["freq"] += 1

    def process_corpus(self, files):
        """Process multiple files, printing each path as it completes."""
        for file in files:
            self.process_file(file)
            print(f"Processed: {file}")

    def get_top_collocations(self, n=20, dep_rel=None):
        """Return the n most frequent (key, info) bigram pairs.

        Args:
            n: number of entries to return.
            dep_rel: if given, restrict results to that dependency relation.
        """
        bigrams = self.results["bigram"]
        if dep_rel:
            bigrams = {k: v for k, v in bigrams.items() if v["dep_rel"] == dep_rel}
        return sorted(bigrams.items(), key=lambda x: x[1]["freq"], reverse=True)[:n]
# Usage example
if __name__ == "__main__":
    # BUGFIX: CORPUS_FILES was never defined anywhere in this file, so the
    # original example raised NameError at import time. Build it from
    # plain-text files in the working directory (glob is already imported
    # at the top of the file), and guard the demo behind __main__ so
    # importing this module no longer runs it.
    CORPUS_FILES = sorted(glob.glob("*.txt"))

    extractor = CollocationExtractor()
    extractor.process_corpus(CORPUS_FILES[:5])

    print(f"\nCorpus size: {extractor.results['corpus_size']} tokens")
    print(f"Unique words: {len(extractor.results['unigram'])}")
    print(f"Unique collocations: {len(extractor.results['bigram'])}")

    # Show top collocations
    print("\nTop 10 collocations:")
    for key, info in extractor.get_top_collocations(10):
        print(f"{info['word1']} {info['word2']} ({info['dep_rel']}): {info['freq']}")