"""Extract dependency-based collocations from a text corpus using spaCy."""

# Load packages
import glob

import pandas as pd  # NOTE(review): unused here; presumably used downstream — confirm
import spacy

# Initialize spaCy model (module-level pipeline, kept for backward compatibility)
nlp = spacy.load("en_core_web_sm")

# Define dependency relations for collocation extraction.
# Each triple is (dependency label, POS of the collocation's first word,
# POS of its second word). Common patterns: adj-noun, verb-object,
# verb-adverb, noun-prep-noun.
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),       # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),      # direct object (e.g., "eat food")
    ("advmod", "VERB", "ADV"),     # adverb modifier (e.g., "run quickly")
    ("nmod", "NOUN", "NOUN"),      # noun modifier (e.g., "cup of tea")
    ("compound", "NOUN", "NOUN"),  # compound nouns (e.g., "computer science")
]

# Fix: the usage example below referenced an undefined CORPUS_FILES name
# (NameError at runtime). Define it from a glob pattern; adjust the pattern
# to match your corpus layout.
CORPUS_FILES = sorted(glob.glob("*.txt"))


class CollocationExtractor:
    """Class-based collocation extractor (recommended design).

    Parses text files with spaCy and accumulates unigram frequencies and
    dependency-based bigram (collocation) frequencies in ``self.results``.
    """

    def __init__(self, patterns=None, pipeline=None):
        """Initialize the extractor.

        Args:
            patterns: Iterable of (dep_label, pos1, pos2) triples; defaults
                to a copy of COLLOCATION_PATTERNS. (Fix: the original used
                the module-level list directly as a mutable default, shared
                across instances.)
            pipeline: Optional pre-loaded spaCy pipeline. If omitted, the
                model is loaded here (original behavior), avoiding a forced
                second load when the caller already has one.
        """
        self.nlp = pipeline if pipeline is not None else spacy.load("en_core_web_sm")
        self.patterns = list(patterns) if patterns is not None else list(COLLOCATION_PATTERNS)
        self.results = {
            "corpus_size": 0,
            "unigram": {},
            "bigram": {},
        }

    def process_file(self, filepath):
        """Process a single file and update internal results."""
        # Load text
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()

        # Parse with spaCy
        doc = self.nlp(text)

        # Extract collocations
        collocations = self._extract_collocations(doc)

        # Update results internally
        self._update_results(text, collocations)

    def _extract_collocations(self, doc):
        """Extract collocations based on dependency patterns.

        Returns a list of (word1, word2, dep_rel) lowercase triples ordered
        so word1/word2 match the pattern's (pos1, pos2).
        """
        collocations = []
        for token in doc:
            for dep_rel, pos1, pos2 in self.patterns:
                if token.dep_ != dep_rel:
                    continue
                if dep_rel in ("amod", "compound"):
                    # Dependent-first patterns: the dependent token carries
                    # pos1 and its head carries pos2 (e.g., ADJ -> NOUN).
                    if token.pos_ == pos1 and token.head.pos_ == pos2:
                        collocations.append(
                            (token.text.lower(), token.head.text.lower(), dep_rel))
                else:
                    # Head-first patterns (dobj, nmod, advmod): the head
                    # carries pos1 and the dependent carries pos2.
                    # Fix: "advmod" was previously checked dependent-first
                    # (token VERB, head ADV), which can never match — the
                    # adverb is the dependent and the verb is its head, so
                    # verb-adverb collocations were silently never extracted.
                    if token.head.pos_ == pos1 and token.pos_ == pos2:
                        collocations.append(
                            (token.head.text.lower(), token.text.lower(), dep_rel))
        return collocations

    def _update_results(self, text, collocations):
        """Update internal results - no need to pass results around."""
        # Update corpus size.
        # NOTE(review): size is measured by whitespace split, not spaCy
        # tokenization — counts will differ from len(doc); confirm intended.
        tokens = text.split()
        self.results["corpus_size"] += len(tokens)

        # Update unigram frequencies (case-folded)
        unigram = self.results["unigram"]
        for token in tokens:
            token_lower = token.lower()
            unigram[token_lower] = unigram.get(token_lower, 0) + 1

        # Update bigram frequencies keyed by "word1_word2_deprel"
        bigram = self.results["bigram"]
        for word1, word2, dep_rel in collocations:
            bigram_key = f"{word1}_{word2}_{dep_rel}"
            if bigram_key not in bigram:
                bigram[bigram_key] = {
                    "word1": word1,
                    "word2": word2,
                    "dep_rel": dep_rel,
                    "freq": 0,
                }
            bigram[bigram_key]["freq"] += 1

    def process_corpus(self, files):
        """Process multiple files."""
        for file in files:
            self.process_file(file)
            print(f"Processed: {file}")

    def get_top_collocations(self, n=20, dep_rel=None):
        """Get top n collocations, optionally filtered by dependency relation.

        Returns a list of (key, info) pairs sorted by frequency, descending.
        """
        bigrams = self.results["bigram"]
        if dep_rel:
            bigrams = {k: v for k, v in bigrams.items() if v["dep_rel"] == dep_rel}
        sorted_bigrams = sorted(
            bigrams.items(), key=lambda x: x[1]["freq"], reverse=True)
        return sorted_bigrams[:n]


# Usage example — guarded so importing this module does not run the demo.
if __name__ == "__main__":
    extractor = CollocationExtractor()
    extractor.process_corpus(CORPUS_FILES[:5])

    print(f"\nCorpus size: {extractor.results['corpus_size']} tokens")
    print(f"Unique words: {len(extractor.results['unigram'])}")
    print(f"Unique collocations: {len(extractor.results['bigram'])}")

    # Show top collocations
    print("\nTop 10 collocations:")
    for key, info in extractor.get_top_collocations(10):
        print(f"{info['word1']} {info['word2']} ({info['dep_rel']}): {info['freq']}")