| """ |
| CROSS-DOMAIN DATASET BİRLEŞTİRME - DOĞRU YÖNTEM |
| =============================================== |
| |
| Bu modül, farklı domain'lerden dataset'leri birleştirirken |
| karşılaşılan schema mismatch problemini çözer ve best practices gösterir. |
| """ |
|
|
| from datasets import Dataset, concatenate_datasets |
| import numpy as np |
| import json |
|
|
| print("="*70) |
| print("🔧 CROSS-DOMAIN DATASET BİRLEŞTİRME - PROBLEM VE ÇÖZÜM") |
| print("="*70) |
|
|
| |
def generate_scientific_papers(num_samples=100):
    """Build a synthetic scientific-papers dataset (id, abstract, domain, year)."""
    def _rows():
        # Years cycle through 2020..2024 so the column has a few distinct values.
        for idx in range(num_samples):
            yield {
                'id': f'sci_{idx}',
                'abstract': f'Scientific text {idx}',
                'domain': 'Physics',
                'year': 2020 + (idx % 5),
            }
    return Dataset.from_generator(_rows)
|
|
def generate_code_dataset(num_samples=100):
    """Build a synthetic source-code dataset (id, code, language, lines_of_code)."""
    def _rows():
        for idx in range(num_samples):
            yield {
                'id': f'code_{idx}',
                'code': f'def func_{idx}(): pass',
                'language': 'Python',
                'lines_of_code': 5,
            }
    return Dataset.from_generator(_rows)
|
|
def generate_financial_dataset(num_samples=100):
    """Build a synthetic financial-news dataset (id, text, sentiment, company)."""
    def _rows():
        for idx in range(num_samples):
            yield {
                'id': f'fin_{idx}',
                'text': f'Company {idx} reports earnings',
                'sentiment': 'positive',
                'company': f'Corp{idx}',
            }
    return Dataset.from_generator(_rows)
|
|
| print("\n📚 Sample Datasets Oluşturuluyor...") |
| sci_dataset = generate_scientific_papers(100) |
| code_dataset = generate_code_dataset(100) |
| fin_dataset = generate_financial_dataset(100) |
|
|
| print(f"✅ Scientific: {len(sci_dataset)} örnekler") |
| print(f" Kolonlar: {sci_dataset.column_names}") |
| print(f"✅ Code: {len(code_dataset)} örnekler") |
| print(f" Kolonlar: {code_dataset.column_names}") |
| print(f"✅ Financial: {len(fin_dataset)} örnekler") |
| print(f" Kolonlar: {fin_dataset.column_names}") |
|
|
|
|
| print("\n" + "="*70) |
| print("❌ PROBLEM: YANLIŞ YÖNTEM") |
| print("="*70) |
|
|
| print(""" |
| Hatalı Yaklaşım: |
| - Her dataset farklı metadata structure'ı |
| - Schema mismatch hatası |
| - Arrow type error |
| |
| Örnek hatalı kod: |
| metadata: {'type': domain, 'year': year} # Scientific |
| metadata: {'language': lang, 'lines': loc} # Code |
| metadata: {'sentiment': sent, 'company': comp} # Financial |
| |
| ❌ concatenate_datasets() çalışmaz! |
| """) |
|
|
|
|
| print("\n" + "="*70) |
| print("✅ ÇÖZÜM 1: ORTAK SCHEMA - FLATTEN APPROACH") |
| print("="*70) |
|
|
| print("\n🔧 Tüm alanları flatten edelim (en basit çözüm):") |
|
|
def normalize_to_flat_schema(example, domain_type):
    """
    Normalize one example from any domain into a single flat, shared schema.

    Every domain-specific field becomes its own column; fields that do not
    apply to this example's domain stay None, so all rows end up with
    identical columns (a requirement for `concatenate_datasets`).

    Args:
        example: Raw example dict from one of the source datasets.
        domain_type: One of 'scientific', 'code' or 'financial'.

    Returns:
        Dict with the full flat schema: id, text, domain plus all
        per-domain columns (None where not applicable).

    Raises:
        ValueError: If `domain_type` is not a known domain. (The original
        code silently produced an empty-text, all-None row in that case.)
    """
    base = {
        'id': example.get('id', ''),
        'text': '',
        'domain': domain_type,
        # Scientific-only columns.
        'abstract': None,
        'sci_domain': None,
        'year': None,
        # Code-only columns.
        'code': None,
        'language': None,
        'lines_of_code': None,
        # Financial-only columns.
        'sentiment': None,
        'company': None,
    }

    if domain_type == 'scientific':
        base['text'] = example.get('abstract', '')
        base['abstract'] = example.get('abstract', '')
        base['sci_domain'] = example.get('domain', '')
        base['year'] = example.get('year', None)
    elif domain_type == 'code':
        base['text'] = example.get('code', '')
        base['code'] = example.get('code', '')
        base['language'] = example.get('language', '')
        base['lines_of_code'] = example.get('lines_of_code', None)
    elif domain_type == 'financial':
        base['text'] = example.get('text', '')
        base['sentiment'] = example.get('sentiment', '')
        base['company'] = example.get('company', '')
    else:
        # Fail fast instead of emitting a malformed row for a typo'd domain.
        raise ValueError(f"Unknown domain_type: {domain_type!r}")

    return base
|
|
| |
| print(" Normalizing scientific dataset...") |
| sci_flat = sci_dataset.map( |
| lambda x: normalize_to_flat_schema(x, 'scientific'), |
| remove_columns=sci_dataset.column_names, |
| desc="Flattening scientific" |
| ) |
|
|
| print(" Normalizing code dataset...") |
| code_flat = code_dataset.map( |
| lambda x: normalize_to_flat_schema(x, 'code'), |
| remove_columns=code_dataset.column_names, |
| desc="Flattening code" |
| ) |
|
|
| print(" Normalizing financial dataset...") |
| fin_flat = fin_dataset.map( |
| lambda x: normalize_to_flat_schema(x, 'financial'), |
| remove_columns=fin_dataset.column_names, |
| desc="Flattening financial" |
| ) |
|
|
| |
| print("\n✅ Birleştiriliyor...") |
| multi_domain_flat = concatenate_datasets([sci_flat, code_flat, fin_flat]) |
|
|
| print(f"\n🎉 BAŞARILI! Multi-domain dataset: {len(multi_domain_flat)} örnek") |
| print(f"Kolonlar: {multi_domain_flat.column_names}") |
|
|
| |
| print("\n📊 Her domain'den örnek:") |
| print("\n1. Scientific örnek:") |
| sci_ex = multi_domain_flat[0] |
| print(f" Domain: {sci_ex['domain']}") |
| print(f" Text: {sci_ex['text'][:50]}...") |
| print(f" Year: {sci_ex['year']}") |
| print(f" Language: {sci_ex['language']}") |
|
|
| print("\n2. Code örnek:") |
| code_ex = multi_domain_flat[100] |
| print(f" Domain: {code_ex['domain']}") |
| print(f" Text: {code_ex['text'][:50]}...") |
| print(f" Language: {code_ex['language']}") |
| print(f" Year: {code_ex['year']}") |
|
|
| print("\n3. Financial örnek:") |
| fin_ex = multi_domain_flat[200] |
| print(f" Domain: {fin_ex['domain']}") |
| print(f" Text: {fin_ex['text'][:50]}...") |
| print(f" Sentiment: {fin_ex['sentiment']}") |
| print(f" Company: {fin_ex['company']}") |
|
|
|
|
| print("\n" + "="*70) |
| print("✅ ÇÖZÜM 2: JSON METADATA - FLEXIBLE APPROACH") |
| print("="*70) |
|
|
| print("\n🔧 Metadata'yı JSON string olarak sakla (daha esnek):") |
|
|
def normalize_to_json_schema(example, domain_type):
    """
    Normalize one example into a minimal shared schema plus a JSON metadata blob.

    Common fields (id, text, domain) become real columns while every
    domain-specific field is serialized into the 'metadata_json' string
    column — the schema stays identical across domains without one sparse
    column per field.

    Args:
        example: Raw example dict from one of the source datasets.
        domain_type: One of 'scientific', 'code' or 'financial'.

    Returns:
        Dict with keys: id, text, domain, metadata_json (JSON-encoded str).

    Raises:
        ValueError: If `domain_type` is not a known domain. (The original
        code silently produced text='' with metadata '{}' in that case.)
    """
    base = {
        'id': example.get('id', ''),
        'text': '',
        'domain': domain_type,
        'metadata_json': ''
    }

    if domain_type == 'scientific':
        base['text'] = example.get('abstract', '')
        metadata = {
            'domain': example.get('domain', ''),
            'year': example.get('year', None)
        }
    elif domain_type == 'code':
        base['text'] = example.get('code', '')
        metadata = {
            'language': example.get('language', ''),
            'lines_of_code': example.get('lines_of_code', None)
        }
    elif domain_type == 'financial':
        base['text'] = example.get('text', '')
        metadata = {
            'sentiment': example.get('sentiment', ''),
            'company': example.get('company', '')
        }
    else:
        # Fail fast on a typo'd/unknown domain instead of emitting junk.
        raise ValueError(f"Unknown domain_type: {domain_type!r}")

    base['metadata_json'] = json.dumps(metadata)
    return base
|
|
| |
| print(" Normalizing with JSON metadata...") |
| sci_json = sci_dataset.map( |
| lambda x: normalize_to_json_schema(x, 'scientific'), |
| remove_columns=sci_dataset.column_names |
| ) |
| code_json = code_dataset.map( |
| lambda x: normalize_to_json_schema(x, 'code'), |
| remove_columns=code_dataset.column_names |
| ) |
| fin_json = fin_dataset.map( |
| lambda x: normalize_to_json_schema(x, 'financial'), |
| remove_columns=fin_dataset.column_names |
| ) |
|
|
| |
multi_domain_json = concatenate_datasets([sci_json, code_json, fin_json])


print(f"\n✅ Multi-domain (JSON): {len(multi_domain_json)} örnek")
print(f"Kolonlar: {multi_domain_json.column_names}")


# Decode and display the JSON metadata of one row per domain
# (each source contributed 100 rows, hence offsets 0/100/200).
print("\n📊 JSON Metadata Örnekleri:")
for pos, idx in enumerate([0, 100, 200], start=1):
    row = multi_domain_json[idx]
    meta = json.loads(row['metadata_json'])
    print(f"\n{pos}. {row['domain'].capitalize()}:")
    print(f" Text: {row['text'][:50]}...")
    print(f" Metadata: {meta}")
|
|
|
|
| print("\n" + "="*70) |
| print("✅ ÇÖZÜM 3: SEPARATE TABLES - DATABASE APPROACH") |
| print("="*70) |
|
|
| print(""" |
| 🗄️ Database-style Approach: |
| |
| Ana tablo (unified): |
| - id |
| - text |
| - domain |
| - reference_id |
| |
| Domain-specific tablolar: |
| - scientific_metadata: reference_id -> {year, domain, ...} |
| - code_metadata: reference_id -> {language, lines, ...} |
| - financial_metadata: reference_id -> {sentiment, company, ...} |
| |
| 장점: |
| ✓ Schema flexibility |
| ✓ Easy to extend |
| ✓ Efficient storage |
| ✓ Type safety |
| |
| 단점: |
| ✗ Join gerekir |
| ✗ Daha kompleks |
| """) |
|
|
| |
def create_separated_tables(datasets_dict):
    """
    Split multi-domain data into one unified text table plus per-domain
    metadata tables (database-style normalization).

    Rows are numbered with a single global `reference_id` counter that acts
    as the join key between the unified table and each metadata table.

    Args:
        datasets_dict: Mapping with keys 'scientific', 'code' and 'financial';
            each value is an iterable of example dicts for that domain.

    Returns:
        Dict with:
          'unified': Dataset with columns id/text/domain/reference_id.
          'metadata': dict mapping domain name -> Dataset of that domain's
              metadata keyed by reference_id.
    """
    # (domain, text-source field, metadata columns as (output_name, source_key)).
    # Drives all three passes; the original had three copy-pasted loops.
    domain_specs = (
        ('scientific', 'abstract', (('sci_domain', 'domain'), ('year', 'year'))),
        ('code', 'code', (('language', 'language'), ('lines_of_code', 'lines_of_code'))),
        ('financial', 'text', (('sentiment', 'sentiment'), ('company', 'company'))),
    )

    unified = []
    metadata_tables = {domain: [] for domain, _, _ in domain_specs}
    ref_id = 0  # Global row counter shared across domains; the join key.

    for domain, text_key, meta_fields in domain_specs:
        for ex in datasets_dict[domain]:
            unified.append({
                'id': ex['id'],
                'text': ex[text_key],
                'domain': domain,
                'reference_id': ref_id,
            })
            meta_row = {'reference_id': ref_id}
            for out_name, src_key in meta_fields:
                meta_row[out_name] = ex[src_key]
            metadata_tables[domain].append(meta_row)
            ref_id += 1

    def _rows_to_dataset(rows, columns):
        # Column-orient the row dicts. Passing `columns` explicitly keeps the
        # schema stable even when `rows` is empty — the original code indexed
        # rows[0].keys() and raised IndexError for an empty domain.
        return Dataset.from_dict({col: [row[col] for row in rows] for col in columns})

    return {
        'unified': _rows_to_dataset(unified, ('id', 'text', 'domain', 'reference_id')),
        'metadata': {
            domain: _rows_to_dataset(
                metadata_tables[domain],
                ('reference_id',) + tuple(out for out, _ in meta_fields),
            )
            for domain, _, meta_fields in domain_specs
        },
    }
|
|
| print("\n🔧 Creating separated tables...") |
| separated = create_separated_tables({ |
| 'scientific': sci_dataset, |
| 'code': code_dataset, |
| 'financial': fin_dataset |
| }) |
|
|
| print(f"\n✅ Unified table: {len(separated['unified'])} records") |
| print(f" Columns: {separated['unified'].column_names}") |
|
|
| for domain, meta_table in separated['metadata'].items(): |
| print(f"\n✅ {domain.capitalize()} metadata: {len(meta_table)} records") |
| print(f" Columns: {meta_table.column_names}") |
|
|
| |
| print("\n🔗 Join Example - Scientific record:") |
| unified_ex = separated['unified'][0] |
| ref_id = unified_ex['reference_id'] |
| sci_meta = [ex for ex in separated['metadata']['scientific'] if ex['reference_id'] == ref_id][0] |
|
|
| print(f" Main table: {unified_ex}") |
| print(f" Metadata: {sci_meta}") |
|
|
|
|
| print("\n" + "="*70) |
| print("📚 BEST PRACTICES - CROSS-DOMAIN DATASETS") |
| print("="*70) |
|
|
| print(""" |
| ✅ FLATTEN APPROACH: |
| 장점: |
| - En basit yöntem |
| - Hızlı erişim |
| - Tüm veriler bir yerde |
| 단점: |
| - Çok fazla None değer (sparse) |
| - Schema değişikliği zor |
| - Memory inefficient |
| |
| Ne zaman kullan: |
| - Az sayıda domain |
| - Benzer field'lar |
| - Simple queries |
| |
| ✅ JSON METADATA APPROACH: |
| 장점: |
| - Esnek schema |
| - Kolay extend |
| - Daha az None |
| 단점: |
| - Parse gerekir |
| - Type safety yok |
| - Query daha yavaş |
| |
| Ne zaman kullan: |
| - Çok farklı domain'ler |
| - Sık schema değişikliği |
| - Prototype/exploration |
| |
| ✅ SEPARATE TABLES APPROACH: |
| 장점: |
| - Temiz schema |
| - Type safe |
| - Efficient storage |
| - Professional approach |
| 단점: |
| - Join gerekir |
| - Daha kompleks |
| - Setup overhead |
| |
| Ne zaman kullan: |
| - Production systems |
| - Çok domain |
| - Complex queries |
| - Large scale |
| |
| ✅ HYBRID APPROACH: |
| - Common fields flatten |
| - Rare fields JSON |
| - Best of both worlds |
| |
| Örnek: |
| { |
| 'id': string, |
| 'text': string, |
| 'domain': string, |
| 'common_field_1': value, |
| 'common_field_2': value, |
| 'extra_metadata_json': json_string |
| } |
| |
| 🎯 RECOMMENDATION: |
| Small project → JSON approach |
| Medium project → Flatten approach |
| Large project → Separate tables |
| Research → Hybrid approach |
| """) |
|
|
|
|
| print("\n" + "="*70) |
| print("🔍 KARŞILAŞTIRMA - PERFORMANCE & STORAGE") |
| print("="*70) |
|
|
| import sys |
|
|
| print("\n📊 Memory Usage Comparison:") |
| print(f" Flatten: {sys.getsizeof(multi_domain_flat.data)} bytes") |
| print(f" JSON: {sys.getsizeof(multi_domain_json.data)} bytes") |
| print(f" Separated (unified): {sys.getsizeof(separated['unified'].data)} bytes") |
|
|
| print("\n🚀 Query Speed Simulation:") |
| print(" Flatten: O(1) - Direct column access") |
| print(" JSON: O(1) + parse overhead") |
| print(" Separated: O(log n) - Join required") |
|
|
| print("\n💾 Storage Efficiency:") |
| total_flat = len(multi_domain_flat) * len(multi_domain_flat.column_names) |
| total_json = len(multi_domain_json) * len(multi_domain_json.column_names) |
| total_sep = len(separated['unified']) + sum(len(t) for t in separated['metadata'].values()) |
|
|
| print(f" Flatten: {total_flat} total fields") |
| print(f" JSON: {total_json} total fields") |
| print(f" Separated: {total_sep} total fields") |
|
|
|
|
| print("\n" + "="*70) |
| print("✅ ÇÖZÜM ÖZETİ") |
| print("="*70) |
|
|
| print(""" |
| 🎯 Ana Sorun: |
| ArrowTypeError: struct fields don't match |
| |
| 🔧 Çözümler: |
| 1. Flatten: Tüm field'ları ayrı kolonlara çıkar |
| 2. JSON: Metadata'yı JSON string olarak sakla |
| 3. Separated: Ana tablo + metadata tabloları |
| |
| ✅ En İyi Yaklaşım: |
| - Küçük projeler: JSON |
| - Orta projeler: Flatten + JSON hybrid |
| - Büyük projeler: Separated tables |
| |
| ⚡ Key Takeaway: |
| Farklı schema'ları birleştirmeden önce |
| ortak bir format'a normalize et! |
| """) |
|
|
| print("\n🎉 Problem çözüldü! Artık cross-domain dataset'leri güvenle birleştirebilirsiniz.") |
|
|