File size: 4,232 Bytes
5ac114d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Load packages

import spacy
import glob
import pandas as pd

# Initialize spaCy model (small English pipeline: tagger + dependency parser)
nlp = spacy.load("en_core_web_sm")

# Define dependency relations for collocation extraction
# Common patterns: adj-noun, verb-object, verb-adverb, noun-prep-noun
#
# Each triple is (dependency_label, POS_of_first_word, POS_of_second_word).
# For the modifier relations (amod, advmod, compound) the first POS is the
# dependent token and the second is its head; for the remaining relations
# (dobj, nmod) the first POS is the head — this matches the ordering logic
# in CollocationExtractor._extract_collocations below.
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),     # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),    # direct object (e.g., "eat food")
    ("advmod", "VERB", "ADV"),   # adverb modifier (e.g., "run quickly")
    ("nmod", "NOUN", "NOUN"),    # noun modifier (e.g., "cup of tea")
    ("compound", "NOUN", "NOUN"), # compound nouns (e.g., "computer science")
]


# Alternative approach: Class-based design (recommended)

class CollocationExtractor:
    """Extract dependency-based collocations from plain-text files.

    Accumulates, across all processed files:
      - ``results["corpus_size"]``: total whitespace-token count,
      - ``results["unigram"]``: lowercased word -> frequency,
      - ``results["bigram"]``: "<w1>_<w2>_<dep>" -> {word1, word2, dep_rel, freq}.
    """

    def __init__(self, patterns=None, nlp=None):
        """
        Args:
            patterns: iterable of (dep_label, POS, POS) triples selecting which
                dependency arcs count as collocations. Defaults to the
                module-level COLLOCATION_PATTERNS (resolved at call time, not
                at class-definition time).
            nlp: an already-loaded spaCy pipeline to reuse. Pass the
                module-level ``nlp`` to avoid loading "en_core_web_sm" a
                second time; by default a fresh model is loaded, matching the
                original behavior.
        """
        self.nlp = nlp if nlp is not None else spacy.load("en_core_web_sm")
        self.patterns = patterns if patterns is not None else COLLOCATION_PATTERNS
        self.results = {
            "corpus_size": 0,   # total whitespace tokens seen so far
            "unigram": {},      # word -> frequency
            "bigram": {},       # "<w1>_<w2>_<dep>" -> stats dict with "freq"
        }

    def process_file(self, filepath):
        """Parse one UTF-8 text file and fold its counts into self.results."""
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        doc = self.nlp(text)
        collocations = self._extract_collocations(doc)
        self._update_results(text, collocations)

    def _extract_collocations(self, doc):
        """Return (word1, word2, dep_rel) tuples for arcs matching self.patterns.

        Words are lowercased. For modifier relations (amod, advmod, compound)
        the dependent token is listed first ("big house"); for the others the
        head is listed first ("eat food").
        """
        collocations = []
        for token in doc:
            for dep_rel, pos1, pos2 in self.patterns:
                if token.dep_ != dep_rel:
                    continue
                if dep_rel in ("amod", "advmod", "compound"):
                    if token.pos_ == pos1 and token.head.pos_ == pos2:
                        collocations.append(
                            (token.text.lower(), token.head.text.lower(), dep_rel)
                        )
                elif token.head.pos_ == pos1 and token.pos_ == pos2:
                    collocations.append(
                        (token.head.text.lower(), token.text.lower(), dep_rel)
                    )
        return collocations

    def _update_results(self, text, collocations):
        """Accumulate corpus size, unigram and bigram frequencies in place.

        NOTE(review): unigrams are counted over whitespace tokens
        (``text.split()``), not spaCy tokens, so punctuation stays attached
        to words and counts may differ from the parser's tokenization —
        confirm this is the intended corpus-size definition.
        """
        tokens = text.split()
        self.results["corpus_size"] += len(tokens)

        unigram = self.results["unigram"]
        for token in tokens:
            key = token.lower()
            unigram[key] = unigram.get(key, 0) + 1

        bigram = self.results["bigram"]
        for word1, word2, dep_rel in collocations:
            entry = bigram.setdefault(
                f"{word1}_{word2}_{dep_rel}",
                {"word1": word1, "word2": word2, "dep_rel": dep_rel, "freq": 0},
            )
            entry["freq"] += 1

    def process_corpus(self, files):
        """Process multiple files, printing a progress line after each one."""
        for file in files:
            self.process_file(file)
            print(f"Processed: {file}")

    def get_top_collocations(self, n=20, dep_rel=None):
        """Return the n most frequent (key, info) bigram pairs.

        Args:
            n: maximum number of entries to return.
            dep_rel: if given, restrict results to that dependency relation.
        """
        bigrams = self.results["bigram"]
        if dep_rel:
            bigrams = {k: v for k, v in bigrams.items() if v["dep_rel"] == dep_rel}
        sorted_bigrams = sorted(bigrams.items(), key=lambda x: x[1]["freq"], reverse=True)
        return sorted_bigrams[:n]

# Usage example
# NOTE(review): CORPUS_FILES is not defined anywhere in this file's visible
# code — presumably a list of text-file paths (e.g. built with glob.glob,
# which is imported above) defined in an earlier notebook cell. Confirm it
# exists before running; otherwise this raises NameError.
extractor = CollocationExtractor()
extractor.process_corpus(CORPUS_FILES[:5])

# Summary statistics accumulated in extractor.results
print(f"\nCorpus size: {extractor.results['corpus_size']} tokens")
print(f"Unique words: {len(extractor.results['unigram'])}")
print(f"Unique collocations: {len(extractor.results['bigram'])}")

# Show top collocations
print("\nTop 10 collocations:")
for key, info in extractor.get_top_collocations(10):
    print(f"{info['word1']} {info['word2']} ({info['dep_rel']}): {info['freq']}")