# Source: advanced-dataset-tutorial / space / modules / 02b_cross_domain_fix.py
# Author: Mehmet Tuğrul Kaya
# Initial commit: Advanced Dataset Tutorial (commit 2e6a47d)
"""
CROSS-DOMAIN DATASET BİRLEŞTİRME - DOĞRU YÖNTEM
===============================================
Bu modül, farklı domain'lerden dataset'leri birleştirirken
karşılaşılan schema mismatch problemini çözer ve best practices gösterir.
"""
from datasets import Dataset, concatenate_datasets
import numpy as np
import json
# Banner for the tutorial's console output.
print("="*70)
print("🔧 CROSS-DOMAIN DATASET BİRLEŞTİRME - PROBLEM VE ÇÖZÜM")
print("="*70)
# Create synthetic datasets
def generate_scientific_papers(num_samples=100):
    """Build a synthetic scientific-papers dataset.

    Args:
        num_samples: number of examples to generate.

    Returns:
        datasets.Dataset with columns id / abstract / domain / year.
    """
    def gen():
        # 'year' cycles through 2020-2024 so the column is not constant.
        for i in range(num_samples):
            yield {
                'id': f'sci_{i}',
                'abstract': f'Scientific text {i}',
                'domain': 'Physics',
                'year': 2020 + (i % 5)
            }
    return Dataset.from_generator(gen)
def generate_code_dataset(num_samples=100):
    """Build a synthetic source-code dataset.

    Args:
        num_samples: number of examples to generate.

    Returns:
        datasets.Dataset with columns id / code / language / lines_of_code.
    """
    def gen():
        for i in range(num_samples):
            yield {
                'id': f'code_{i}',
                'code': f'def func_{i}(): pass',
                'language': 'Python',
                'lines_of_code': 5
            }
    return Dataset.from_generator(gen)
def generate_financial_dataset(num_samples=100):
    """Build a synthetic financial-news dataset.

    Args:
        num_samples: number of examples to generate.

    Returns:
        datasets.Dataset with columns id / text / sentiment / company.
    """
    def gen():
        for i in range(num_samples):
            yield {
                'id': f'fin_{i}',
                'text': f'Company {i} reports earnings',
                'sentiment': 'positive',
                'company': f'Corp{i}'
            }
    return Dataset.from_generator(gen)
# Build the three synthetic source datasets and print their (deliberately
# different) column sets — the schema mismatch is the point of this demo.
print("\n📚 Sample Datasets Oluşturuluyor...")
sci_dataset = generate_scientific_papers(100)
code_dataset = generate_code_dataset(100)
fin_dataset = generate_financial_dataset(100)
print(f"✅ Scientific: {len(sci_dataset)} örnekler")
print(f" Kolonlar: {sci_dataset.column_names}")
print(f"✅ Code: {len(code_dataset)} örnekler")
print(f" Kolonlar: {code_dataset.column_names}")
print(f"✅ Financial: {len(fin_dataset)} örnekler")
print(f" Kolonlar: {fin_dataset.column_names}")
# Explain why naive concatenation fails: Arrow requires identical schemas
# across all datasets passed to concatenate_datasets().
print("\n" + "="*70)
print("❌ PROBLEM: YANLIŞ YÖNTEM")
print("="*70)
print("""
Hatalı Yaklaşım:
- Her dataset farklı metadata structure'ı
- Schema mismatch hatası
- Arrow type error
Örnek hatalı kod:
metadata: {'type': domain, 'year': year} # Scientific
metadata: {'language': lang, 'lines': loc} # Code
metadata: {'sentiment': sent, 'company': comp} # Financial
❌ concatenate_datasets() çalışmaz!
""")
# Solution 1: flatten every domain's fields into one wide shared schema.
print("\n" + "="*70)
print("✅ ÇÖZÜM 1: ORTAK SCHEMA - FLATTEN APPROACH")
print("="*70)
print("\n🔧 Tüm alanları flatten edelim (en basit çözüm):")
def normalize_to_flat_schema(example, domain_type):
    """Project a domain-specific example onto one wide, shared schema.

    Every domain's fields become their own columns; fields that do not
    apply to this example's domain stay None (sparse but simple, and
    Arrow-compatible for concatenate_datasets).

    Args:
        example: raw example dict from one of the source datasets.
        domain_type: 'scientific', 'code' or 'financial'.

    Returns:
        dict with the full shared column set.
    """
    base = {
        'id': example.get('id', ''),
        'text': '',
        'domain': domain_type,
        # Scientific fields
        'abstract': None,
        'sci_domain': None,
        'year': None,
        # Code fields
        'code': None,
        'language': None,
        'lines_of_code': None,
        # Financial fields
        'sentiment': None,
        'company': None,
    }
    # Fill the columns that exist for this domain; 'text' always carries the
    # domain's main content field so downstream code can stay generic.
    if domain_type == 'scientific':
        base['text'] = example.get('abstract', '')
        base['abstract'] = example.get('abstract', '')
        base['sci_domain'] = example.get('domain', '')
        base['year'] = example.get('year', None)
    elif domain_type == 'code':
        base['text'] = example.get('code', '')
        base['code'] = example.get('code', '')
        base['language'] = example.get('language', '')
        base['lines_of_code'] = example.get('lines_of_code', None)
    elif domain_type == 'financial':
        base['text'] = example.get('text', '')
        base['sentiment'] = example.get('sentiment', '')
        base['company'] = example.get('company', '')
    return base
# Normalize each source dataset onto the shared flat schema.
print(" Normalizing scientific dataset...")
sci_flat = sci_dataset.map(
    lambda x: normalize_to_flat_schema(x, 'scientific'),
    remove_columns=sci_dataset.column_names,
    desc="Flattening scientific"
)
print(" Normalizing code dataset...")
code_flat = code_dataset.map(
    lambda x: normalize_to_flat_schema(x, 'code'),
    remove_columns=code_dataset.column_names,
    desc="Flattening code"
)
print(" Normalizing financial dataset...")
fin_flat = fin_dataset.map(
    lambda x: normalize_to_flat_schema(x, 'financial'),
    remove_columns=fin_dataset.column_names,
    desc="Flattening financial"
)
# Now concatenate — works, because all three share an identical column set.
print("\n✅ Birleştiriliyor...")
multi_domain_flat = concatenate_datasets([sci_flat, code_flat, fin_flat])
print(f"\n🎉 BAŞARILI! Multi-domain dataset: {len(multi_domain_flat)} örnek")
print(f"Kolonlar: {multi_domain_flat.column_names}")
# Show one example per domain (rows 0 / 100 / 200 by construction).
print("\n📊 Her domain'den örnek:")
print("\n1. Scientific örnek:")
sci_ex = multi_domain_flat[0]
print(f" Domain: {sci_ex['domain']}")
print(f" Text: {sci_ex['text'][:50]}...")
print(f" Year: {sci_ex['year']}")
print(f" Language: {sci_ex['language']}")  # should be None (code-only field)
print("\n2. Code örnek:")
code_ex = multi_domain_flat[100]
print(f" Domain: {code_ex['domain']}")
print(f" Text: {code_ex['text'][:50]}...")
print(f" Language: {code_ex['language']}")
print(f" Year: {code_ex['year']}")  # should be None (scientific-only field)
print("\n3. Financial örnek:")
fin_ex = multi_domain_flat[200]
print(f" Domain: {fin_ex['domain']}")
print(f" Text: {fin_ex['text'][:50]}...")
print(f" Sentiment: {fin_ex['sentiment']}")
print(f" Company: {fin_ex['company']}")
# Solution 2: store domain-specific metadata as a single JSON string column.
print("\n" + "="*70)
print("✅ ÇÖZÜM 2: JSON METADATA - FLEXIBLE APPROACH")
print("="*70)
print("\n🔧 Metadata'yı JSON string olarak sakla (daha esnek):")
def normalize_to_json_schema(example, domain_type):
    """Normalize an example to a shared schema, packing domain-specific
    fields into a single JSON string column.

    More flexible than the flat schema: adding a new domain never changes
    the column set, only the JSON payload (at the cost of parsing later).

    Args:
        example: raw example dict from one of the source datasets.
        domain_type: 'scientific', 'code' or 'financial'.

    Returns:
        dict with keys id / text / domain / metadata_json.
    """
    base = {
        'id': example.get('id', ''),
        'text': '',
        'domain': domain_type,
        'metadata_json': ''
    }
    metadata = {}
    if domain_type == 'scientific':
        base['text'] = example.get('abstract', '')
        metadata = {
            'domain': example.get('domain', ''),
            'year': example.get('year', None)
        }
    elif domain_type == 'code':
        base['text'] = example.get('code', '')
        metadata = {
            'language': example.get('language', ''),
            'lines_of_code': example.get('lines_of_code', None)
        }
    elif domain_type == 'financial':
        base['text'] = example.get('text', '')
        metadata = {
            'sentiment': example.get('sentiment', ''),
            'company': example.get('company', '')
        }
    # Unknown domains serialize an empty metadata object ("{}").
    base['metadata_json'] = json.dumps(metadata)
    return base
# Normalize every source dataset to the shared JSON-metadata schema.
print(" Normalizing with JSON metadata...")
sci_json = sci_dataset.map(
    lambda x: normalize_to_json_schema(x, 'scientific'),
    remove_columns=sci_dataset.column_names
)
code_json = code_dataset.map(
    lambda x: normalize_to_json_schema(x, 'code'),
    remove_columns=code_dataset.column_names
)
fin_json = fin_dataset.map(
    lambda x: normalize_to_json_schema(x, 'financial'),
    remove_columns=fin_dataset.column_names
)
# Concatenate — all three now share id / text / domain / metadata_json.
multi_domain_json = concatenate_datasets([sci_json, code_json, fin_json])
print(f"\n✅ Multi-domain (JSON): {len(multi_domain_json)} örnek")
print(f"Kolonlar: {multi_domain_json.column_names}")
# Parse the JSON metadata back out for one example per domain.
print("\n📊 JSON Metadata Örnekleri:")
for i, idx in enumerate([0, 100, 200]):
    ex = multi_domain_json[idx]
    metadata = json.loads(ex['metadata_json'])
    print(f"\n{i+1}. {ex['domain'].capitalize()}:")
    print(f" Text: {ex['text'][:50]}...")
    print(f" Metadata: {metadata}")
# Solution 3: database-style normalization with separate metadata tables.
print("\n" + "="*70)
print("✅ ÇÖZÜM 3: SEPARATE TABLES - DATABASE APPROACH")
print("="*70)
print("""
🗄️ Database-style Approach:
Ana tablo (unified):
- id
- text
- domain
- reference_id
Domain-specific tablolar:
- scientific_metadata: reference_id -> {year, domain, ...}
- code_metadata: reference_id -> {language, lines, ...}
- financial_metadata: reference_id -> {sentiment, company, ...}
장점:
✓ Schema flexibility
✓ Easy to extend
✓ Efficient storage
✓ Type safety
단점:
✗ Join gerekir
✗ Daha kompleks
""")
# Simple in-memory implementation of the separated-tables approach.
def create_separated_tables(datasets_dict):
    """Split cross-domain data into one unified table plus per-domain
    metadata tables, linked by a synthetic reference_id (database style).

    Args:
        datasets_dict: mapping with keys 'scientific', 'code', 'financial',
            each an iterable of example dicts shaped like the generators
            above produce.

    Returns:
        dict with:
            'unified':  Dataset(id, text, domain, reference_id)
            'metadata': {domain: Dataset(reference_id, ...domain fields)}
    """
    def _columnar(rows):
        # list-of-dicts -> dict-of-columns, as Dataset.from_dict expects.
        return {key: [row[key] for row in rows] for key in rows[0].keys()}

    # (domain, field used as the unified 'text', metadata extractor).
    # Iteration order matters: it fixes reference_id assignment.
    specs = [
        ('scientific', 'abstract',
         lambda ex: {'sci_domain': ex['domain'], 'year': ex['year']}),
        ('code', 'code',
         lambda ex: {'language': ex['language'], 'lines_of_code': ex['lines_of_code']}),
        ('financial', 'text',
         lambda ex: {'sentiment': ex['sentiment'], 'company': ex['company']}),
    ]
    unified = []
    metadata_tables = {domain: [] for domain, _, _ in specs}
    ref_id = 0  # running key linking unified rows to metadata rows
    for domain, text_field, meta_fn in specs:
        for ex in datasets_dict[domain]:
            unified.append({
                'id': ex['id'],
                'text': ex[text_field],
                'domain': domain,
                'reference_id': ref_id
            })
            row = {'reference_id': ref_id}
            row.update(meta_fn(ex))
            metadata_tables[domain].append(row)
            ref_id += 1
    return {
        'unified': Dataset.from_dict(_columnar(unified)),
        'metadata': {domain: Dataset.from_dict(_columnar(rows))
                     for domain, rows in metadata_tables.items()}
    }
# Build the separated tables from the three raw datasets and show a join.
print("\n🔧 Creating separated tables...")
separated = create_separated_tables({
    'scientific': sci_dataset,
    'code': code_dataset,
    'financial': fin_dataset
})
print(f"\n✅ Unified table: {len(separated['unified'])} records")
print(f" Columns: {separated['unified'].column_names}")
for domain, meta_table in separated['metadata'].items():
    print(f"\n✅ {domain.capitalize()} metadata: {len(meta_table)} records")
    print(f" Columns: {meta_table.column_names}")
# Join example: find the metadata row whose reference_id matches the
# unified row's. next() stops at the first hit instead of scanning all rows.
print("\n🔗 Join Example - Scientific record:")
unified_ex = separated['unified'][0]
ref_id = unified_ex['reference_id']
sci_meta = next(ex for ex in separated['metadata']['scientific'] if ex['reference_id'] == ref_id)
print(f" Main table: {unified_ex}")
print(f" Metadata: {sci_meta}")
# Best-practice summary for the three approaches (display only).
# NOTE(review): the 장점/단점 headings below are Korean ("pros"/"cons") in an
# otherwise Turkish file — looks like a copy-paste artifact in the original.
print("\n" + "="*70)
print("📚 BEST PRACTICES - CROSS-DOMAIN DATASETS")
print("="*70)
print("""
✅ FLATTEN APPROACH:
장점:
- En basit yöntem
- Hızlı erişim
- Tüm veriler bir yerde
단점:
- Çok fazla None değer (sparse)
- Schema değişikliği zor
- Memory inefficient
Ne zaman kullan:
- Az sayıda domain
- Benzer field'lar
- Simple queries
✅ JSON METADATA APPROACH:
장점:
- Esnek schema
- Kolay extend
- Daha az None
단점:
- Parse gerekir
- Type safety yok
- Query daha yavaş
Ne zaman kullan:
- Çok farklı domain'ler
- Sık schema değişikliği
- Prototype/exploration
✅ SEPARATE TABLES APPROACH:
장점:
- Temiz schema
- Type safe
- Efficient storage
- Professional approach
단점:
- Join gerekir
- Daha kompleks
- Setup overhead
Ne zaman kullan:
- Production systems
- Çok domain
- Complex queries
- Large scale
✅ HYBRID APPROACH:
- Common fields flatten
- Rare fields JSON
- Best of both worlds
Örnek:
{
'id': string,
'text': string,
'domain': string,
'common_field_1': value,
'common_field_2': value,
'extra_metadata_json': json_string
}
🎯 RECOMMENDATION:
Small project → JSON approach
Medium project → Flatten approach
Large project → Separate tables
Research → Hybrid approach
""")
# Rough comparison of the three layouts (display only).
print("\n" + "="*70)
print("🔍 KARŞILAŞTIRMA - PERFORMANCE & STORAGE")
print("="*70)
import sys  # only needed for this comparison section
# NOTE(review): sys.getsizeof is shallow — it measures the wrapper object,
# not the underlying Arrow buffers, so these numbers understate real memory.
print("\n📊 Memory Usage Comparison:")
print(f" Flatten: {sys.getsizeof(multi_domain_flat.data)} bytes")
print(f" JSON: {sys.getsizeof(multi_domain_json.data)} bytes")
print(f" Separated (unified): {sys.getsizeof(separated['unified'].data)} bytes")
print("\n🚀 Query Speed Simulation:")
print(" Flatten: O(1) - Direct column access")
print(" JSON: O(1) + parse overhead")
print(" Separated: O(log n) - Join required")
# Field counts as a crude storage proxy: rows x columns for the wide tables,
# total row count across all tables for the separated layout.
print("\n💾 Storage Efficiency:")
total_flat = len(multi_domain_flat) * len(multi_domain_flat.column_names)
total_json = len(multi_domain_json) * len(multi_domain_json.column_names)
total_sep = len(separated['unified']) + sum(len(t) for t in separated['metadata'].values())
print(f" Flatten: {total_flat} total fields")
print(f" JSON: {total_json} total fields")
print(f" Separated: {total_sep} total fields")
# Final recap of the problem and the three solutions (display only).
print("\n" + "="*70)
print("✅ ÇÖZÜM ÖZETİ")
print("="*70)
print("""
🎯 Ana Sorun:
ArrowTypeError: struct fields don't match
🔧 Çözümler:
1. Flatten: Tüm field'ları ayrı kolonlara çıkar
2. JSON: Metadata'yı JSON string olarak sakla
3. Separated: Ana tablo + metadata tabloları
✅ En İyi Yaklaşım:
- Küçük projeler: JSON
- Orta projeler: Flatten + JSON hybrid
- Büyük projeler: Separated tables
⚡ Key Takeaway:
Farklı schema'ları birleştirmeden önce
ortak bir format'a normalize et!
""")
print("\n🎉 Problem çözüldü! Artık cross-domain dataset'leri güvenle birleştirebilirsiniz.")