import json
import os
import re
from collections import defaultdict

# Compiled once: tokenize() runs per document inside tight loops.
_WORD_RE = re.compile(r"\b\w+\b")

# The spaCy pipeline is loaded lazily (and exactly once) by _get_nlp() so the
# module can be imported — and its pure-stdlib helpers used — without spaCy or
# the en_core_web_sm model installed.
_nlp = None


def _get_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    parser and NER are disabled because only lemmatization is needed.
    """
    global _nlp
    if _nlp is None:
        import spacy  # deferred: heavy import + model load
        _nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return _nlp


def tokenize(text):
    """Lowercase *text* and return its word tokens (runs of \\w characters)."""
    return _WORD_RE.findall(text.lower())


def has_bigram(tokens, bigram):
    """Return True if *bigram* occurs as a contiguous run in *tokens*.

    *bigram* is underscore-joined ("new_york"); any n-gram length works.
    """
    parts = bigram.split('_')
    n = len(parts)
    return any(tokens[i:i + n] == parts for i in range(len(tokens) - n + 1))


def build_inverse_lemma_map(docs_file_path, cache_path=None):
    """
    Build or load a mapping from lemma -> set of surface forms seen in corpus.

    If cache_path is provided and exists, loads from it. Else builds from
    scratch and saves to cache_path.
    """
    if cache_path and os.path.exists(cache_path):
        print(f"[INFO] Loading cached lemma_to_forms from {cache_path}")
        with open(cache_path, "r", encoding="utf-8") as f:
            raw_map = json.load(f)
        # JSON stores lists; restore sets for fast intersection downstream.
        return {lemma: set(forms) for lemma, forms in raw_map.items()}

    print(f"[INFO] Building inverse lemma map from {docs_file_path}...")
    nlp = _get_nlp()
    lemma_to_forms = defaultdict(set)
    with open(docs_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            doc = json.loads(line)
            tokens = tokenize(doc['text'])
            spacy_doc = nlp(" ".join(tokens))
            for token in spacy_doc:
                lemma_to_forms[token.lemma_].add(token.text.lower())

    if cache_path:
        print(f"[INFO] Saving lemma_to_forms to {cache_path}")
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:  # os.makedirs("") raises for bare filenames
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump({k: list(v) for k, v in lemma_to_forms.items()}, f, indent=2)

    return lemma_to_forms


def build_inverted_index(docs_file_path, vocab_set, lemma_map_path=None):
    """Build a word -> timestamp -> [doc_id] inverted index over a JSONL corpus.

    Each corpus line is a JSON object with "text" and "timestamp" fields.
    Unigram vocab entries are matched through every surface form of the
    lemma (via build_inverse_lemma_map); entries containing '_' are matched
    as contiguous token runs (has_bigram).

    Returns (index, docs, lemma_to_forms) where docs is the list of
    {"text": ..., "timestamp": int} records in file order.
    """
    vocab_unigrams = {w for w in vocab_set if '_' not in w}
    vocab_bigrams = {w for w in vocab_set if '_' in w}

    # Load or build lemma map
    lemma_to_forms = build_inverse_lemma_map(docs_file_path, cache_path=lemma_map_path)

    index = defaultdict(lambda: defaultdict(list))
    docs = []
    global_seen_words = set()

    with open(docs_file_path, 'r', encoding='utf-8') as f:
        for doc_id, line in enumerate(f):
            doc = json.loads(line)
            text = doc['text']
            timestamp = int(doc['timestamp'])
            docs.append({"text": text, "timestamp": timestamp})

            tokens = tokenize(text)
            token_set = set(tokens)
            seen_words = set()

            # Match all lemma queries using surface forms
            for lemma in vocab_unigrams:
                surface_forms = lemma_to_forms.get(lemma, set())
                if token_set & surface_forms:
                    index[lemma][timestamp].append(doc_id)
                    seen_words.add(lemma)

            # Unigrams and bigrams are disjoint sets and each bigram is
            # visited once per doc, so no seen_words guard is needed here.
            for bigram in vocab_bigrams:
                if has_bigram(tokens, bigram):
                    index[bigram][timestamp].append(doc_id)
                    seen_words.add(bigram)

            global_seen_words.update(seen_words)

            if (doc_id + 1) % 500 == 0:
                missing = vocab_set - global_seen_words
                print(f"[INFO] After {doc_id+1} docs, {len(missing)} vocab words still not seen.")
                print("Example missing words:", list(missing)[:5])

    missing_final = vocab_set - global_seen_words
    if missing_final:
        print(f"[WARNING] {len(missing_final)} vocab words were never found in any document.")
        print("Examples:", list(missing_final)[:10])

    return index, docs, lemma_to_forms


def save_index_to_disk(index, index_path):
    """Serialize *index* to JSON at *index_path* (timestamp keys -> strings)."""
    index_clean = {
        word: {str(ts): doc_ids for ts, doc_ids in ts_dict.items()}
        for word, ts_dict in index.items()
    }
    out_dir = os.path.dirname(index_path)
    if out_dir:  # os.makedirs("") raises for bare filenames
        os.makedirs(out_dir, exist_ok=True)
    with open(index_path, "w", encoding='utf-8') as f:
        json.dump(index_clean, f, ensure_ascii=False)


def load_index_from_disk(index_path):
    """Inverse of save_index_to_disk: restore nested defaultdicts with
    int timestamp keys from the JSON file at *index_path*."""
    with open(index_path, 'r', encoding='utf-8') as f:
        raw_index = json.load(f)
    index = defaultdict(lambda: defaultdict(list))
    for word, ts_dict in raw_index.items():
        for ts, doc_ids in ts_dict.items():
            index[word][int(ts)] = doc_ids
    return index


def load_docs(docs_file_path):
    """Read a JSONL corpus into [{"text": str, "timestamp": int}, ...]."""
    docs = []
    with open(docs_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            doc = json.loads(line)
            docs.append({
                "text": doc["text"],
                "timestamp": int(doc["timestamp"]),
            })
    return docs


def load_index(docs_file_path, vocab, index_path=None, lemma_map_path=None):
    """Load the inverted index from *index_path* if it exists, otherwise
    build it from the corpus (and cache it when *index_path* is given).

    Returns (index, docs, lemma_to_forms).
    """
    if index_path and os.path.exists(index_path):
        index = load_index_from_disk(index_path)
        docs = load_docs(docs_file_path)
        lemma_to_forms = build_inverse_lemma_map(docs_file_path, cache_path=lemma_map_path)
        return index, docs, lemma_to_forms

    index, docs, lemma_to_forms = build_inverted_index(
        docs_file_path, set(vocab), lemma_map_path=lemma_map_path
    )
    if index_path:
        save_index_to_disk(index, index_path)
    return index, docs, lemma_to_forms