# simple-text-analyzer / text_analyzer / collocation_extractor.py
# (HuggingFace upload by egumasa -- commit "construction extractor", 5ac114d)
# Load packages
import spacy
import glob
import pandas as pd
# Initialize spaCy model
# NOTE(review): this module-level pipeline is never referenced in this file --
# CollocationExtractor loads its own copy in __init__, so the model is loaded
# twice. Consider removing one load; first confirm no other module does
# `from collocation_extractor import nlp`.
nlp = spacy.load("en_core_web_sm")
# Define dependency relations for collocation extraction.
# Each triple is (dep_label, pos1, pos2). For amod/advmod/compound the
# extractor compares pos1 against the DEPENDENT token and pos2 against its
# head; for all other relations pos1 is the head and pos2 the dependent.
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),       # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),      # direct object (e.g., "eat food")
    # BUGFIX: the dependent of an advmod arc is the adverb and its head is
    # the verb; the original ("advmod", "VERB", "ADV") order could never
    # match, silently dropping all adverb collocations.
    ("advmod", "ADV", "VERB"),     # adverb modifier (e.g., "run quickly")
    # NOTE(review): en_core_web_sm analyzes "cup of tea" as prep/pobj, not
    # nmod -- this pattern may rarely fire; verify against your corpus.
    ("nmod", "NOUN", "NOUN"),      # noun modifier
    ("compound", "NOUN", "NOUN"),  # compound nouns (e.g., "computer science")
]
# Alternative approach: Class-based design (recommended)
class CollocationExtractor:
    """Extract dependency-based collocations from a corpus of text files.

    Accumulates whitespace-token counts (corpus size, unigram frequencies)
    and dependency-pattern bigram frequencies into ``self.results``:

    - ``corpus_size``: int, running whitespace-token count over all files
    - ``unigram``: ``{lowercased_word: freq}``
    - ``bigram``: ``{"w1_w2_dep": {"word1", "word2", "dep_rel", "freq"}}``
    """

    def __init__(self, patterns=None):
        """Create an extractor.

        Args:
            patterns: iterable of (dep_label, pos1, pos2) triples; defaults
                to the module-level COLLOCATION_PATTERNS. BUGFIX: the
                original used the mutable module-level list directly as the
                default argument value, sharing one list object across all
                instances; use a None sentinel instead.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.patterns = COLLOCATION_PATTERNS if patterns is None else patterns
        self.results = {
            "corpus_size": 0,
            "unigram": {},
            "bigram": {},
        }

    def process_file(self, filepath):
        """Read, parse, and fold a single file into the internal results."""
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        doc = self.nlp(text)
        collocations = self._extract_collocations(doc)
        self._update_results(text, collocations)

    def _extract_collocations(self, doc):
        """Return (word1, word2, dep_rel) triples for matching dependency arcs.

        For amod/advmod/compound the dependent token precedes its head in
        the output pair; for every other relation the head comes first.
        """
        collocations = []
        for token in doc:
            for dep_rel, pos1, pos2 in self.patterns:
                if token.dep_ != dep_rel:
                    continue
                if dep_rel in ("amod", "advmod", "compound"):
                    # pos1 = dependent's POS, pos2 = head's POS
                    if token.pos_ == pos1 and token.head.pos_ == pos2:
                        collocations.append(
                            (token.text.lower(), token.head.text.lower(), dep_rel))
                else:
                    # pos1 = head's POS, pos2 = dependent's POS
                    if token.head.pos_ == pos1 and token.pos_ == pos2:
                        collocations.append(
                            (token.head.text.lower(), token.text.lower(), dep_rel))
        return collocations

    def _update_results(self, text, collocations):
        """Fold one document's token and collocation counts into self.results."""
        # NOTE(review): corpus_size and unigrams count *whitespace* tokens,
        # so they can disagree with spaCy's tokenization used for bigrams.
        tokens = text.split()
        self.results["corpus_size"] += len(tokens)

        unigram = self.results["unigram"]
        for token in tokens:
            key = token.lower()
            unigram[key] = unigram.get(key, 0) + 1

        bigram = self.results["bigram"]
        for word1, word2, dep_rel in collocations:
            entry = bigram.setdefault(
                f"{word1}_{word2}_{dep_rel}",
                {"word1": word1, "word2": word2, "dep_rel": dep_rel, "freq": 0},
            )
            entry["freq"] += 1

    def process_corpus(self, files):
        """Process multiple files, printing each path as it completes."""
        for file in files:
            self.process_file(file)
            print(f"Processed: {file}")

    def get_top_collocations(self, n=20, dep_rel=None):
        """Return the n most frequent (key, info) bigram pairs.

        Args:
            n: number of entries to return.
            dep_rel: if given, restrict results to that dependency relation.
        """
        bigrams = self.results["bigram"]
        if dep_rel:
            bigrams = {k: v for k, v in bigrams.items() if v["dep_rel"] == dep_rel}
        return sorted(bigrams.items(), key=lambda x: x[1]["freq"], reverse=True)[:n]
# Usage example
if __name__ == "__main__":
    # BUGFIX: CORPUS_FILES was never defined anywhere in this file, so the
    # original example raised NameError at import time. Build it from
    # plain-text files in the working directory (glob is already imported
    # at the top of the file), and guard the demo behind __main__ so
    # importing this module no longer runs it.
    CORPUS_FILES = sorted(glob.glob("*.txt"))

    extractor = CollocationExtractor()
    extractor.process_corpus(CORPUS_FILES[:5])

    print(f"\nCorpus size: {extractor.results['corpus_size']} tokens")
    print(f"Unique words: {len(extractor.results['unigram'])}")
    print(f"Unique collocations: {len(extractor.results['bigram'])}")

    # Show top collocations
    print("\nTop 10 collocations:")
    for key, info in extractor.get_top_collocations(10):
        print(f"{info['word1']} {info['word2']} ({info['dep_rel']}): {info['freq']}")