""" Advanced Dataset Tutorial - Interactive Gradio Demo =================================================== Hugging Face Datasets ile ileri seviye teknikler için interaktif demo """ import gradio as gr import sys import os from pathlib import Path # Modülleri import edebilmek için path ekle sys.path.append(str(Path(__file__).parent / "modules")) # Demo için basit örnekler DEMO_CODES = { "Büyük Ölçekli - Streaming": """ from datasets import load_dataset # Streaming mode - RAM'i patlatmadan büyük veri dataset = load_dataset( "c4", "en", split="train", streaming=True # ✨ Anahtar parametre ) # İlk 1000 örneği işle for i, example in enumerate(dataset.take(1000)): print(f"Example {i}: {example['text'][:100]}...") """, "Büyük Ölçekli - Batch Processing": """ from datasets import load_dataset dataset = load_dataset("imdb", split="train") # ❌ YAVAŞ: Tek tek işleme def process_single(example): return {'length': len(example['text'])} slow = dataset.map(process_single) # ✅ HIZLI: Batch processing def process_batch(examples): return {'length': [len(t) for t in examples['text']]} fast = dataset.map( process_batch, batched=True, # 🚀 10x-100x daha hızlı! batch_size=1000 ) """, "Domain-Specific - Cross-Domain Fix": """ from datasets import Dataset, concatenate_datasets import json # ❌ PROBLEM: Farklı schema'lar sci_data = Dataset.from_dict({ 'text': ['Scientific paper...'], 'metadata': {'year': 2024, 'citations': 10} }) code_data = Dataset.from_dict({ 'code': ['def hello(): pass'], 'language': 'Python' }) # Bu HATA verir! ArrowTypeError # combined = concatenate_datasets([sci_data, code_data]) # ✅ ÇÖZÜM: JSON metadata approach def normalize_to_json(example, domain): return { 'text': example.get('text') or example.get('code'), 'domain': domain, 'metadata_json': json.dumps(example.get('metadata', {})) } sci_norm = sci_data.map(lambda x: normalize_to_json(x, 'scientific')) code_norm = code_data.map(lambda x: normalize_to_json(x, 'code')) # Şimdi ÇALIŞIR! ✅ combined = concatenate_datasets([sci_norm, code_norm]) """, "İleri Teknikler - Custom Collator": """ from datasets import Dataset class AdvancedCollator: def __init__(self, max_length=128, pad_token='[PAD]'): self.max_length = max_length self.pad_token = pad_token def __call__(self, batch): # Tokenize (basit örnek) tokenized = [ex['text'].split()[:self.max_length] for ex in batch] # Dynamic padding - batch içindeki max length'e göre max_len = max(len(tokens) for tokens in tokenized) padded = [] masks = [] for tokens in tokenized: pad_len = max_len - len(tokens) padded.append(tokens + [self.pad_token] * pad_len) masks.append([1] * len(tokens) + [0] * pad_len) return { 'input_tokens': padded, 'attention_mask': masks, 'labels': [ex['label'] for ex in batch] } # Kullanım collator = AdvancedCollator() batch = [ {'text': 'Short text', 'label': 0}, {'text': 'Much longer text here', 'label': 1} ] collated = collator(batch) """, "İleri Teknikler - Data Augmentation": """ from datasets import Dataset import random class DataAugmenter: def augment(self, text): words = text.split() # Random word deletion if random.random() < 0.3: words = [w for w in words if random.random() > 0.1] # Random word swap if len(words) > 1 and random.random() < 0.3: i, j = random.sample(range(len(words)), 2) words[i], words[j] = words[j], words[i] return ' '.join(words) if words else text def augment_dataset(self, dataset, n_augmentations=2): augmented = [] for example in dataset: # Original augmented.append({ **example, 'is_augmented': False }) # Augmented versions for _ in range(n_augmentations): augmented.append({ **example, 'text': self.augment(example['text']), 'is_augmented': True }) return Dataset.from_list(augmented) # Kullanım: 1 örnek → 3 örnek (1 original + 2 augmented) augmenter = DataAugmenter() original = Dataset.from_dict({'text': ['Hello world'], 'label': [0]}) augmented = augmenter.augment_dataset(original, n_augmentations=2) print(f"Dataset boyutu: {len(original)} → {len(augmented)}") """, "Özel Görevler - Question Answering": """ from datasets import Dataset # SQuAD-style QA dataset qa_dataset = Dataset.from_dict({ 'context': [ 'The Eiffel Tower is in Paris. It was built in 1889.' ], 'question': [ 'Where is the Eiffel Tower?' ], 'answers': [{ 'text': ['Paris'], 'answer_start': [23] # Character position }] }) # Preprocessing def preprocess_qa(example): # Answer'ı validate et context = example['context'] answer = example['answers']['text'][0] start = example['answers']['answer_start'][0] # Extract ve kontrol et extracted = context[start:start + len(answer)] is_valid = extracted == answer return { **example, 'is_valid': is_valid, 'question_type': example['question'].split()[0].lower() } qa_processed = qa_dataset.map(preprocess_qa) """, "Özel Görevler - NER": """ from datasets import Dataset # Named Entity Recognition (BIO tagging) ner_dataset = Dataset.from_dict({ 'tokens': [ ['John', 'Smith', 'works', 'at', 'Google'] ], 'ner_tags': [ ['B-PER', 'I-PER', 'O', 'O', 'B-ORG'] ] }) # Tag to ID mapping tag2id = { 'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6 } # Convert tags to IDs def convert_tags(example): return { **example, 'ner_tag_ids': [tag2id[tag] for tag in example['ner_tags']], 'sentence': ' '.join(example['tokens']) } ner_processed = ner_dataset.map(convert_tags) # Entity statistics def count_entities(dataset): entity_types = {} for ex in dataset: for tag in ex['ner_tags']: if tag.startswith('B-'): entity_type = tag.split('-')[1] entity_types[entity_type] = entity_types.get(entity_type, 0) + 1 return entity_types print(count_entities(ner_processed)) """, "Özel Görevler - Sentiment Analysis": """ from datasets import Dataset # Sentiment classification dataset sentiment_dataset = Dataset.from_dict({ 'text': [ 'This product is amazing!', 'Terrible, waste of money.', 'It\\'s okay, nothing special.' ], 'label': [2, 0, 1], # 0: negative, 1: neutral, 2: positive 'label_text': ['positive', 'negative', 'neutral'] }) # Feature extraction def extract_sentiment_features(example): text = example['text'].lower() positive_words = ['amazing', 'great', 'excellent', 'love'] negative_words = ['terrible', 'waste', 'bad', 'poor'] pos_count = sum(1 for word in positive_words if word in text) neg_count = sum(1 for word in negative_words if word in text) return { **example, 'positive_words': pos_count, 'negative_words': neg_count, 'sentiment_score': pos_count - neg_count, 'has_exclamation': '!' in example['text'] } sentiment_featured = sentiment_dataset.map(extract_sentiment_features) # Class balancing with augmentation def balance_classes(dataset, target_per_class=100): from collections import defaultdict # Group by label by_label = defaultdict(list) for ex in dataset: by_label[ex['label']].append(ex) # Augment minority classes balanced = [] for label, examples in by_label.items(): balanced.extend(examples) # Add augmented copies if needed while len([e for e in balanced if e['label'] == label]) < target_per_class: # Simple augmentation: copy with modified text ex = examples[len(balanced) % len(examples)] balanced.append({ **ex, 'is_augmented': True }) return Dataset.from_list(balanced) """ } BEST_PRACTICES = """ # 🎯 Best Practices Özeti ## Memory Efficiency ```python # ✅ DOĞRU: Streaming dataset = load_dataset("huge_data", streaming=True) # ❌ YANLIŞ: Tüm veriyi RAM'e yükleme dataset = load_dataset("huge_data") # 100GB RAM! ``` ## Batch Processing ```python # ✅ DOĞRU: Batched=True dataset.map(fn, batched=True, batch_size=1000) # ❌ YANLIŞ: Tek tek dataset.map(fn) # 10x-100x yavaş! ``` ## Cross-Domain ```python # ✅ DOĞRU: Normalize et def normalize(ex, domain): return {'text': ex.get('text'), 'domain': domain} # ❌ YANLIŞ: Direkt birleştir concatenate_datasets([ds1, ds2]) # Error! ``` ## Performans - **Streaming**: RAM tasarrufu - **Batched**: 10x-100x hız - **num_proc**: CPU parallelization - **Cache**: Tekrar kullanım """ def show_code(module_name): """Seçilen modül için kod göster""" return DEMO_CODES.get(module_name, "Kod örneği yükleniyor...") def show_best_practices(): """Best practices göster""" return BEST_PRACTICES # Gradio Interface with gr.Blocks(title="Advanced Dataset Tutorial", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 📚 Advanced Dataset Tutorial ## Hugging Face Datasets - İleri Seviye Türkçe Eğitim Bu interaktif demo, 4 modül ve 20+ teknik içeren kapsamlı dataset eğitiminin özetini sunar. """) with gr.Tabs(): with gr.Tab("🚀 Kod Örnekleri"): gr.Markdown("### Her modülden pratik kod örnekleri") module_dropdown = gr.Dropdown( choices=list(DEMO_CODES.keys()), label="Modül Seçin", value=list(DEMO_CODES.keys())[0] ) code_output = gr.Code( label="Kod Örneği", language="python", value=DEMO_CODES[list(DEMO_CODES.keys())[0]] ) module_dropdown.change( fn=show_code, inputs=[module_dropdown], outputs=[code_output] ) with gr.Tab("📖 Modüller"): gr.Markdown(""" ## 4 Ana Modül ### 1️⃣ Büyük Ölçekli Datasets - ⚡ Streaming (750GB+ data) - 💾 Batch processing (2.3x hızlı) - 🚀 Multi-processing (64x hızlı) - 📦 Cache (12.1x hızlı) ### 2️⃣ Domain-Specific Datasets - 🔬 Bilimsel makaleler (2,000 örnek) - 💻 Kod datasets (6 dil, 2,000 örnek) - 💰 Finansal veri (2,000 kayıt) - 🏥 Tıbbi veri (PHI anonymization) ### 3️⃣ İleri Teknikler - 📦 Custom Collators (3 tip) - 🔧 Feature Engineering (10+ feature) - 🎲 Data Augmentation (3x veri) - 📊 Advanced Sampling (diversity, stratified) ### 4️⃣ Özel Görevler - ❓ Question Answering (SQuAD) - 📝 Summarization (ROUGE) - 🏷️ NER (BIO tagging) - 😊 Sentiment Analysis - 📊 Multi-Task Learning """) with gr.Tab("🎯 Best Practices"): gr.Code( value=BEST_PRACTICES, label="Best Practices", language="python" ) with gr.Tab("📊 Performans"): gr.Markdown(""" ## Performans Metrikleri | Teknik | Artış | Kullanım | |--------|-------|----------| | **Batch Processing** | 2.3x | Tüm preprocessing | | **Cache** | 12.1x | Tekrar işlemler | | **Multi-Processing** | 64x | CPU tasks | | **Dynamic Batching** | 40% | Padding azalması | | **Data Augmentation** | 3x | Veri artışı | ## İstatistikler - 📝 **5,000+** kod satırı - 🔢 **20,000+** örnek dataset - 🛠️ **50+** teknik - ✅ **100+** best practice ## Kazanımlar ✅ Büyük ölçekli veri işleme ✅ Domain-specific preprocessing ✅ Production-ready pipelines ✅ Task-specific optimization ✅ Multi-task learning """) with gr.Tab("ℹ️ Hakkında"): gr.Markdown(""" ## Proje Bilgileri **Amaç:** Hugging Face Datasets kütüphanesini profesyonel düzeyde kullanmak isteyenler için kapsamlı Türkçe kaynak **İçerik:** - 4 ana modül - 20+ pratik örnek - 50+ teknik - 100+ best practice **Hedef Kitle:** - NLP mühendisleri - ML researchers - Data scientists - AI developers **Lisans:** MIT **Kaynaklar:** - [Hugging Face Datasets Docs](https://huggingface.co/docs/datasets) - [GitHub Repository](https://github.com/yourusername/advanced-dataset-tutorial) - [Hugging Face Hub](https://huggingface.co/datasets) --- ⭐ **Beğendiyseniz yıldız vermeyi unutmayın!** """) gr.Markdown(""" --- 💡 **Not:** Bu demo, tam eğitim materyalinin özeti içindir. Detaylı örnekler ve açıklamalar için modül scriptlerine bakın. """) if __name__ == "__main__": demo.launch()