Spaces:

uslap
/

uslap-query

Sleeping

App Files Files Community

uslap-query / Code_files /archive /fix_missing_migration.py

uslap

Upload folder using huggingface_hub

7cc8e29 verified 3 months ago

Raw

History Blame Contribute Delete

16.6 kB

	#!/usr/bin/env python3
	"""
	Fix missing data in USLaP database migration:
	1. Populate cross_refs from A5_CROSS_REFS sheet
	2. Update ru_term in existing entries from A1_ЗАПИСИ sheet
	3. Update fa_term in existing entries from PERSIAN_A1_MADĀKHIL sheet
	4. Insert new entries for BITIG_A1_ENTRIES with ORIG2 prefix
	"""

	import sqlite3
	import openpyxl
	import re
	import sys
	import os
	from datetime import datetime

	EXCEL_PATH = "USLaP_Final_Data_Consolidated_Master_v3.xlsx"
	DB_PATH = "Code_files/uslap_lattice.db"

	def clean_column_name(col):
	"""Convert Excel column header to valid SQLite column name."""
	if col is None:
	return "unknown"
	col = str(col).strip()
	col = re.sub(r'[^\w\s]', '', col) # Remove punctuation
	col = re.sub(r'\s+', '_', col) # Replace spaces with underscore
	col = col.lower()
	if not col:
	return "unknown"
	return col

	def fix_cross_refs(conn, cursor, wb):
	"""Fix cross_refs migration - insert missing cross-references."""
	print(" Fixing cross_refs migration...")

	if "A5_CROSS_REFS" not in wb.sheetnames:
	print(" Sheet A5_CROSS_REFS not found")
	return 0

	ws = wb["A5_CROSS_REFS"]
	rows = list(ws.iter_rows(values_only=True))

	if not rows or len(rows) < 2:
	print(" No data in A5_CROSS_REFS")
	return 0

	# First row is headers
	headers = [clean_column_name(cell) for cell in rows[0]]

	count = 0
	for i, row in enumerate(rows[1:], start=1):
	if not any(row):
	continue

	row_dict = {}
	for j, cell in enumerate(row):
	if j < len(headers):
	row_dict[headers[j]] = cell

	# Map columns according to Excel structure
	xref_id = row_dict.get('xref_id')
	from_id = row_dict.get('from_id')
	to_id = row_dict.get('to_id')
	link_type = row_dict.get('link_type')
	description = row_dict.get('description')
	layer_ref = row_dict.get('layer_ref')

	# Skip if missing critical data
	if not xref_id or from_id is None or to_id is None:
	continue

	# Convert IDs to integers if needed
	try:
	from_id = int(from_id)
	to_id = int(to_id)
	except (ValueError, TypeError):
	print(f" Row {i}: Invalid ID format - from_id={from_id}, to_id={to_id}")
	continue

	# Check if entry IDs exist
	cursor.execute('SELECT 1 FROM entries WHERE entry_id = ?', (from_id,))
	if not cursor.fetchone():
	print(f" Row {i}: from_entry_id {from_id} not found in entries")
	continue

	cursor.execute('SELECT 1 FROM entries WHERE entry_id = ?', (to_id,))
	if not cursor.fetchone():
	print(f" Row {i}: to_entry_id {to_id} not found in entries")
	continue

	# Check if cross-ref already exists
	cursor.execute('''
	SELECT 1 FROM cross_refs
	WHERE from_entry_id = ? AND to_entry_id = ? AND link_type = ?
	''', (from_id, to_id, link_type))

	if cursor.fetchone():
	# Already exists, skip
	continue

	try:
	cursor.execute('''
	INSERT INTO cross_refs (
	xref_id, from_entry_id, to_entry_id, link_type, description, layer_ref
	) VALUES (?, ?, ?, ?, ?, ?)
	''', (
	xref_id, from_id, to_id, link_type, description, layer_ref
	))
	count += 1
	except Exception as e:
	print(f" Error inserting row {i}: {e}")
	print(f" Data: xref_id={xref_id}, from_id={from_id}, to_id={to_id}, link_type={link_type}")

	print(f" Inserted {count} cross-references")
	return count

	def fix_russian_entries(conn, cursor, wb):
	"""Fix Russian entries - update ru_term in existing entries."""
	print(" Fixing Russian entries (A1_ЗАПИСИ)...")

	if "A1_ЗАПИСИ" not in wb.sheetnames:
	print(" Sheet A1_ЗАПИСИ not found")
	return 0

	ws = wb["A1_ЗАПИСИ"]
	rows = list(ws.iter_rows(values_only=True))

	if not rows or len(rows) < 2:
	print(" No data in A1_ЗАПИСИ")
	return 0

	# First row is headers (Russian column names)
	headers = [clean_column_name(cell) for cell in rows[0]]

	count = 0
	for i, row in enumerate(rows[1:], start=1):
	if not any(row):
	continue

	row_dict = {}
	for j, cell in enumerate(row):
	if j < len(headers):
	row_dict[headers[j]] = cell

	# Get entry data
	entry_id = row_dict.get('запись_id')
	ru_term = row_dict.get('рус_термин')

	if not entry_id or not ru_term:
	continue

	# Convert entry_id to int if needed
	try:
	entry_id = int(entry_id)
	except (ValueError, TypeError):
	print(f" Row {i}: Invalid entry_id format: {entry_id}")
	continue

	# Check if entry exists
	cursor.execute('SELECT 1 FROM entries WHERE entry_id = ?', (entry_id,))
	if not cursor.fetchone():
	print(f" Row {i}: entry_id {entry_id} not found in entries")
	continue

	# Update ru_term
	try:
	cursor.execute('''
	UPDATE entries
	SET ru_term = ?, modified_at = CURRENT_TIMESTAMP
	WHERE entry_id = ?
	''', (ru_term, entry_id))

	if cursor.rowcount > 0:
	count += 1
	except Exception as e:
	print(f" Error updating row {i}: {e}")
	print(f" Data: entry_id={entry_id}, ru_term={ru_term}")

	print(f" Updated {count} Russian entries")
	return count

	def fix_persian_entries(conn, cursor, wb):
	"""Fix Persian entries - update fa_term in existing entries."""
	print(" Fixing Persian entries (PERSIAN_A1_MADĀKHIL)...")

	if "PERSIAN_A1_MADĀKHIL" not in wb.sheetnames:
	print(" Sheet PERSIAN_A1_MADĀKHIL not found")
	return 0

	ws = wb["PERSIAN_A1_MADĀKHIL"]
	rows = list(ws.iter_rows(values_only=True))

	if not rows or len(rows) < 2:
	print(" No data in PERSIAN_A1_MADĀKHIL")
	return 0

	# First row has complex headers with Persian and English
	headers = [clean_column_name(cell) for cell in rows[0]]

	count = 0
	for i, row in enumerate(rows[1:], start=1):
	if not any(row):
	continue

	row_dict = {}
	for j, cell in enumerate(row):
	if j < len(headers):
	row_dict[headers[j]] = cell

	# Map to entries table schema
	entry_id = row_dict.get('madkhal_identry_id')
	fa_term = row_dict.get('vazhe_farsipersian_term')
	source_word = row_dict.get('kalame_aslisource_word')

	if not entry_id:
	continue

	# Convert entry_id to int if needed
	try:
	entry_id = int(entry_id)
	except (ValueError, TypeError):
	print(f" Row {i}: Invalid entry_id format: {entry_id}")
	continue

	# Check if entry exists
	cursor.execute('SELECT 1 FROM entries WHERE entry_id = ?', (entry_id,))
	if not cursor.fetchone():
	print(f" Row {i}: entry_id {entry_id} not found in entries")
	continue

	# Update fa_term and ar_word if source_word is provided
	try:
	if fa_term:
	cursor.execute('''
	UPDATE entries
	SET fa_term = ?, modified_at = CURRENT_TIMESTAMP
	WHERE entry_id = ?
	''', (fa_term, entry_id))

	if cursor.rowcount > 0:
	count += 1

	# Also update ar_word if source_word is provided
	if source_word:
	cursor.execute('''
	UPDATE entries
	SET ar_word = ?, modified_at = CURRENT_TIMESTAMP
	WHERE entry_id = ? AND (ar_word IS NULL OR ar_word = '')
	''', (source_word, entry_id))

	except Exception as e:
	print(f" Error updating row {i}: {e}")
	print(f" Data: entry_id={entry_id}, fa_term={fa_term}")

	print(f" Updated {count} Persian entries")
	return count

	def fix_bitig_entries(conn, cursor, wb):
	"""Fix Bitig entries - insert as new rows with ORIG2 prefix."""
	print(" Fixing Bitig entries (BITIG_A1_ENTRIES)...")

	if "BITIG_A1_ENTRIES" not in wb.sheetnames:
	print(" Sheet BITIG_A1_ENTRIES not found")
	return 0

	ws = wb["BITIG_A1_ENTRIES"]
	rows = list(ws.iter_rows(values_only=True))

	if not rows or len(rows) < 2:
	print(" No data in BITIG_A1_ENTRIES")
	return 0

	headers = [clean_column_name(cell) for cell in rows[0]]

	count = 0
	for i, row in enumerate(rows[1:], start=1):
	if not any(row):
	continue

	row_dict = {}
	for j, cell in enumerate(row):
	if j < len(headers):
	row_dict[headers[j]] = cell

	# Get the original entry_id from Excel
	excel_entry_id = row_dict.get('entry_id')

	# Check if this entry_id already exists
	cursor.execute('SELECT 1 FROM entries WHERE entry_id = ?', (excel_entry_id,))
	if cursor.fetchone():
	# Entry already exists, skip
	print(f" Row {i}: entry_id {excel_entry_id} already exists, skipping")
	continue

	# Get next available entry_id
	cursor.execute('SELECT MAX(entry_id) FROM entries')
	max_id = cursor.fetchone()[0] or 0
	new_entry_id = max_id + 1

	# Prepare entry data
	entry_data = {
	'entry_id': new_entry_id,
	'score': row_dict.get('score', 5),
	'en_term': row_dict.get('orig2_term'),
	'ar_word': row_dict.get('orig2_script'),
	'root_letters': row_dict.get('root_letters'),
	'phonetic_chain': row_dict.get('phonetic_chain'),
	'notes': row_dict.get('notes'),
	}

	# Extract root ID if present
	root_id = None
	root_letters = entry_data['root_letters']
	if root_letters:
	# Try to find existing root with these letters
	cursor.execute('SELECT root_id FROM roots WHERE root_letters = ?', (root_letters,))
	result = cursor.fetchone()
	if result:
	root_id = result[0]
	else:
	# Create new root if needed
	cursor.execute('SELECT COUNT(*) FROM roots')
	root_count = cursor.fetchone()[0]
	root_id = f"R{root_count + 1:03d}"

	# Create bare root (without hyphens)
	root_bare = re.sub(r'[\-\s]', '', root_letters) if root_letters else ''
	root_type = 'TRILITERAL' if len(root_bare) == 3 else 'QUADRILITERAL' if len(root_bare) == 4 else 'OTHER'

	cursor.execute('''
	INSERT INTO roots (root_id, root_letters, root_bare, root_type, notes)
	VALUES (?, ?, ?, ?, ?)
	''', (
	root_id, root_letters, root_bare, root_type,
	f"Created during Bitig entries fix for entry {new_entry_id}"
	))

	# For ORIG2 entries, mark with ORIG2 prefix in foundation_refs
	kashgari_attestation = row_dict.get('kashgari_attestation', '')
	foundation_refs = f"ORIG2: {kashgari_attestation}"

	try:
	cursor.execute('''
	INSERT INTO entries (
	entry_id, score, en_term, ar_word, root_id, root_letters,
	phonetic_chain, foundation_refs, notes
	) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
	''', (
	entry_data['entry_id'], entry_data['score'], entry_data['en_term'],
	entry_data['ar_word'], root_id, entry_data['root_letters'],
	entry_data['phonetic_chain'], foundation_refs, entry_data['notes']
	))
	count += 1
	except Exception as e:
	print(f" Error inserting Bitig row {i}: {e}")
	print(f" Data: entry_id={new_entry_id}, en_term={entry_data['en_term']}")

	print(f" Inserted {count} Bitig entries")
	return count

	def main():
	print("\n" + "═" * 70)
	print(" USLaP Migration Fix: Missing Data")
	print(" بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ")
	print("═" * 70)

	# Check if files exist
	if not os.path.exists(EXCEL_PATH):
	print(f"\n❌ ERROR: Excel file not found: {EXCEL_PATH}")
	sys.exit(1)

	if not os.path.exists(DB_PATH):
	print(f"\n❌ ERROR: Database file not found: {DB_PATH}")
	sys.exit(1)

	# Load Excel workbook
	print(f"\n📖 Loading Excel file: {EXCEL_PATH}")
	try:
	wb = openpyxl.load_workbook(EXCEL_PATH, read_only=True, data_only=True)
	except Exception as e:
	print(f"\n❌ ERROR: Failed to load Excel file: {e}")
	sys.exit(1)

	# Connect to database
	print(f"\n🗄️ Connecting to database: {DB_PATH}")
	try:
	conn = sqlite3.connect(DB_PATH)
	cursor = conn.cursor()

	# Enable foreign keys
	cursor.execute("PRAGMA foreign_keys = ON")

	# Begin transaction
	conn.execute("BEGIN TRANSACTION")

	print("\n🔧 Fixing migration issues...")

	# Fix cross_refs
	cross_refs_count = fix_cross_refs(conn, cursor, wb)

	# Fix Russian entries
	russian_count = fix_russian_entries(conn, cursor, wb)

	# Fix Persian entries
	persian_count = fix_persian_entries(conn, cursor, wb)

	# Fix Bitig entries
	bitig_count = fix_bitig_entries(conn, cursor, wb)

	# Commit transaction
	conn.commit()

	# Generate statistics
	print("\n📈 Fix Statistics:")
	print("─" * 40)

	cursor.execute("SELECT COUNT(*) FROM entries")
	entries_count = cursor.fetchone()[0]
	print(f" Total entries: {entries_count}")

	cursor.execute("SELECT COUNT(*) FROM cross_refs")
	crossrefs_count = cursor.fetchone()[0]
	print(f" Total cross-references: {crossrefs_count}")

	cursor.execute("SELECT COUNT(*) FROM entries WHERE ru_term IS NOT NULL AND ru_term != ''")
	ru_count = cursor.fetchone()[0]
	print(f" Entries with ru_term: {ru_count}")

	cursor.execute("SELECT COUNT(*) FROM entries WHERE fa_term IS NOT NULL AND fa_term != ''")
	fa_count = cursor.fetchone()[0]
	print(f" Entries with fa_term: {fa_count}")

	cursor.execute("SELECT COUNT(*) FROM entries WHERE foundation_refs LIKE 'ORIG2:%'")
	orig2_count = cursor.fetchone()[0]
	print(f" ORIG2 entries: {orig2_count}")

	# Verify foreign keys
	print("\n🔍 Verifying foreign key constraints...")
	cursor.execute("PRAGMA foreign_key_check")
	fk_errors = cursor.fetchall()

	if fk_errors:
	print("❌ Foreign key errors found:")
	for error in fk_errors:
	print(f" Table: {error[0]}, Row: {error[1]}, Referenced: {error[2]}, FK Index: {error[3]}")
	else:
	print("✓ All foreign key constraints satisfied")

	# Close workbook and connection
	wb.close()
	conn.close()

	print("\n" + "═" * 70)
	print("✅ MIGRATION FIX COMPLETED SUCCESSFULLY")
	print("═" * 70)

	except Exception as e:
	print(f"\n❌ ERROR during migration fix: {e}")
	import traceback
	traceback.print_exc()

	# Rollback if connection is still open
	try:
	conn.rollback()
	conn.close()
	except:
	pass

	print("\n⚠️ Migration fix failed. Database has been rolled back.")
	sys.exit(1)

	if __name__ == "__main__":
	main()