# Collocation extraction with spaCy.
# (The original header lines "Spaces:" / "Building" were extraction residue
# from a notebook/table export and have been converted to this comment.)
# Load packages
import spacy
import glob
import pandas as pd

# Initialize the shared spaCy pipeline (small English model).
nlp = spacy.load("en_core_web_sm")

# Dependency relations used for collocation extraction.  Each entry is a
# (dependency label, POS tag, POS tag) triple; common patterns covered:
# adj-noun, verb-object, verb-adverb, noun-prep-noun.
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),        # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),       # direct object (e.g., "eat food")
    ("advmod", "VERB", "ADV"),      # adverb modifier (e.g., "run quickly")
    ("nmod", "NOUN", "NOUN"),       # noun modifier (e.g., "cup of tea")
    ("compound", "NOUN", "NOUN"),   # compound nouns (e.g., "computer science")
]
# Alternative approach: Class-based design (recommended)
class CollocationExtractor:
    """Extract dependency-based collocations from a text corpus with spaCy.

    Accumulates corpus-wide statistics in ``self.results``:
      - "corpus_size": total whitespace-token count over all processed files
      - "unigram": lowercased token -> frequency
      - "bigram": "word1_word2_deprel" -> {word1, word2, dep_rel, freq}
    """

    def __init__(self, patterns=None, model="en_core_web_sm"):
        """Initialize the extractor.

        Args:
            patterns: iterable of (dep_label, pos1, pos2) triples; defaults
                to COLLOCATION_PATTERNS.  Using a None sentinel avoids
                sharing a mutable default argument across instances.
            model: spaCy model name to load (generalized from the previously
                hard-coded "en_core_web_sm"; default behavior unchanged).
        """
        self.nlp = spacy.load(model)
        self.patterns = COLLOCATION_PATTERNS if patterns is None else patterns
        self.results = {
            "corpus_size": 0,
            "unigram": {},
            "bigram": {},
        }

    def process_file(self, filepath):
        """Process a single file and update internal results."""
        # Load text
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        # Parse with spaCy
        doc = self.nlp(text)
        # Extract collocations and fold them into the running totals.
        collocations = self._extract_collocations(doc)
        self._update_results(text, collocations)

    def _extract_collocations(self, doc):
        """Return (word1, word2, dep_rel) triples matching self.patterns.

        Two pattern shapes exist:
          - dependent-first ("amod", "compound"): the dependent token's POS
            is pos1 and its head's POS is pos2; output (dependent, head).
          - head-first ("dobj", "nmod", "advmod"): the head's POS is pos1
            and the dependent's POS is pos2; output (head, dependent).

        BUG FIX: "advmod" was previously grouped with the dependent-first
        patterns, but its pattern ("advmod", "VERB", "ADV") lists the *head*
        POS first.  An advmod dependent is an ADV, so the old check
        ``token.pos_ == "VERB"`` never matched and verb-adverb pairs were
        never extracted.  It now takes the head-first branch, yielding
        e.g. ("run", "quickly", "advmod") as the comments intended.
        """
        collocations = []
        for token in doc:
            for dep_rel, pos1, pos2 in self.patterns:
                if token.dep_ != dep_rel:
                    continue
                if dep_rel in ("amod", "compound"):
                    # Dependent-first: token carries pos1, its head pos2.
                    if token.pos_ == pos1 and token.head.pos_ == pos2:
                        collocations.append(
                            (token.text.lower(), token.head.text.lower(), dep_rel)
                        )
                else:
                    # Head-first (dobj, nmod, advmod): head carries pos1.
                    if token.head.pos_ == pos1 and token.pos_ == pos2:
                        collocations.append(
                            (token.head.text.lower(), token.text.lower(), dep_rel)
                        )
        return collocations

    def _update_results(self, text, collocations):
        """Update internal results - no need to pass results around."""
        # Corpus size is measured in whitespace tokens of the raw text.
        tokens = text.split()
        self.results["corpus_size"] += len(tokens)
        # Unigram frequencies (lowercased).
        unigram = self.results["unigram"]
        for token in tokens:
            token_lower = token.lower()
            unigram[token_lower] = unigram.get(token_lower, 0) + 1
        # Bigram (collocation) frequencies, keyed by "w1_w2_deprel".
        for word1, word2, dep_rel in collocations:
            bigram_key = f"{word1}_{word2}_{dep_rel}"
            if bigram_key not in self.results["bigram"]:
                self.results["bigram"][bigram_key] = {
                    "word1": word1,
                    "word2": word2,
                    "dep_rel": dep_rel,
                    "freq": 0,
                }
            self.results["bigram"][bigram_key]["freq"] += 1

    def process_corpus(self, files):
        """Process multiple files, printing progress per file."""
        for file in files:
            self.process_file(file)
            print(f"Processed: {file}")

    def get_top_collocations(self, n=20, dep_rel=None):
        """Get top n collocations, optionally filtered by dependency relation.

        Returns a list of (bigram_key, info_dict) pairs sorted by frequency,
        descending.
        """
        bigrams = self.results["bigram"]
        if dep_rel:
            bigrams = {k: v for k, v in bigrams.items() if v["dep_rel"] == dep_rel}
        sorted_bigrams = sorted(bigrams.items(), key=lambda x: x[1]["freq"], reverse=True)
        return sorted_bigrams[:n]
# Usage example: run the extractor over the first five corpus files and
# print summary statistics.
# NOTE(review): CORPUS_FILES is expected to be defined elsewhere in the
# full file (e.g. via glob) — confirm before running this standalone.
extractor = CollocationExtractor()
extractor.process_corpus(CORPUS_FILES[:5])

print(f"\nCorpus size: {extractor.results['corpus_size']} tokens")
print(f"Unique words: {len(extractor.results['unigram'])}")
print(f"Unique collocations: {len(extractor.results['bigram'])}")

# Show top collocations
print("\nTop 10 collocations:")
for _key, info in extractor.get_top_collocations(10):
    print(f"{info['word1']} {info['word2']} ({info['dep_rel']}): {info['freq']}")