Spaces:

tugrulkaya
/

advanced-dataset-tutorial

Sleeping

File size: 14,236 Bytes

2e6a47d

"""
Advanced Dataset Tutorial - Interactive Gradio Demo
===================================================

Hugging Face Datasets ile ileri seviye teknikler için interaktif demo
"""

import gradio as gr
import sys
import os
from pathlib import Path

# Modülleri import edebilmek için path ekle
sys.path.append(str(Path(__file__).parent / "modules"))

# Demo için basit örnekler
DEMO_CODES = {
    "Büyük Ölçekli - Streaming": """
from datasets import load_dataset

# Streaming mode - RAM'i patlatmadan büyük veri
dataset = load_dataset(
    "c4", 
    "en", 
    split="train",
    streaming=True  # ✨ Anahtar parametre
)

# İlk 1000 örneği işle
for i, example in enumerate(dataset.take(1000)):
    print(f"Example {i}: {example['text'][:100]}...")
""",

    "Büyük Ölçekli - Batch Processing": """
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

# ❌ YAVAŞ: Tek tek işleme
def process_single(example):
    return {'length': len(example['text'])}

slow = dataset.map(process_single)

# ✅ HIZLI: Batch processing
def process_batch(examples):
    return {'length': [len(t) for t in examples['text']]}

fast = dataset.map(
    process_batch,
    batched=True,      # 🚀 10x-100x daha hızlı!
    batch_size=1000
)
""",

    "Domain-Specific - Cross-Domain Fix": """
from datasets import Dataset, concatenate_datasets
import json

# ❌ PROBLEM: Farklı schema'lar
sci_data = Dataset.from_dict({
    'text': ['Scientific paper...'],
    'metadata': {'year': 2024, 'citations': 10}
})

code_data = Dataset.from_dict({
    'code': ['def hello(): pass'],
    'language': 'Python'
})

# Bu HATA verir! ArrowTypeError
# combined = concatenate_datasets([sci_data, code_data])

# ✅ ÇÖZÜM: JSON metadata approach
def normalize_to_json(example, domain):
    return {
        'text': example.get('text') or example.get('code'),
        'domain': domain,
        'metadata_json': json.dumps(example.get('metadata', {}))
    }

sci_norm = sci_data.map(lambda x: normalize_to_json(x, 'scientific'))
code_norm = code_data.map(lambda x: normalize_to_json(x, 'code'))

# Şimdi ÇALIŞIR! ✅
combined = concatenate_datasets([sci_norm, code_norm])
""",

    "İleri Teknikler - Custom Collator": """
from datasets import Dataset

class AdvancedCollator:
    def __init__(self, max_length=128, pad_token='[PAD]'):
        self.max_length = max_length
        self.pad_token = pad_token
    
    def __call__(self, batch):
        # Tokenize (basit örnek)
        tokenized = [ex['text'].split()[:self.max_length] 
                     for ex in batch]
        
        # Dynamic padding - batch içindeki max length'e göre
        max_len = max(len(tokens) for tokens in tokenized)
        
        padded = []
        masks = []
        for tokens in tokenized:
            pad_len = max_len - len(tokens)
            padded.append(tokens + [self.pad_token] * pad_len)
            masks.append([1] * len(tokens) + [0] * pad_len)
        
        return {
            'input_tokens': padded,
            'attention_mask': masks,
            'labels': [ex['label'] for ex in batch]
        }

# Kullanım
collator = AdvancedCollator()
batch = [
    {'text': 'Short text', 'label': 0},
    {'text': 'Much longer text here', 'label': 1}
]
collated = collator(batch)
""",

    "İleri Teknikler - Data Augmentation": """
from datasets import Dataset
import random

class DataAugmenter:
    def augment(self, text):
        words = text.split()
        
        # Random word deletion
        if random.random() < 0.3:
            words = [w for w in words if random.random() > 0.1]
        
        # Random word swap
        if len(words) > 1 and random.random() < 0.3:
            i, j = random.sample(range(len(words)), 2)
            words[i], words[j] = words[j], words[i]
        
        return ' '.join(words) if words else text
    
    def augment_dataset(self, dataset, n_augmentations=2):
        augmented = []
        
        for example in dataset:
            # Original
            augmented.append({
                **example, 
                'is_augmented': False
            })
            
            # Augmented versions
            for _ in range(n_augmentations):
                augmented.append({
                    **example,
                    'text': self.augment(example['text']),
                    'is_augmented': True
                })
        
        return Dataset.from_list(augmented)

# Kullanım: 1 örnek → 3 örnek (1 original + 2 augmented)
augmenter = DataAugmenter()
original = Dataset.from_dict({'text': ['Hello world'], 'label': [0]})
augmented = augmenter.augment_dataset(original, n_augmentations=2)
print(f"Dataset boyutu: {len(original)} → {len(augmented)}")
""",

    "Özel Görevler - Question Answering": """
from datasets import Dataset

# SQuAD-style QA dataset
qa_dataset = Dataset.from_dict({
    'context': [
        'The Eiffel Tower is in Paris. It was built in 1889.'
    ],
    'question': [
        'Where is the Eiffel Tower?'
    ],
    'answers': [{
        'text': ['Paris'],
        'answer_start': [23]  # Character position
    }]
})

# Preprocessing
def preprocess_qa(example):
    # Answer'ı validate et
    context = example['context']
    answer = example['answers']['text'][0]
    start = example['answers']['answer_start'][0]
    
    # Extract ve kontrol et
    extracted = context[start:start + len(answer)]
    is_valid = extracted == answer
    
    return {
        **example,
        'is_valid': is_valid,
        'question_type': example['question'].split()[0].lower()
    }

qa_processed = qa_dataset.map(preprocess_qa)
""",

    "Özel Görevler - NER": """
from datasets import Dataset

# Named Entity Recognition (BIO tagging)
ner_dataset = Dataset.from_dict({
    'tokens': [
        ['John', 'Smith', 'works', 'at', 'Google']
    ],
    'ner_tags': [
        ['B-PER', 'I-PER', 'O', 'O', 'B-ORG']
    ]
})

# Tag to ID mapping
tag2id = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6
}

# Convert tags to IDs
def convert_tags(example):
    return {
        **example,
        'ner_tag_ids': [tag2id[tag] for tag in example['ner_tags']],
        'sentence': ' '.join(example['tokens'])
    }

ner_processed = ner_dataset.map(convert_tags)

# Entity statistics
def count_entities(dataset):
    entity_types = {}
    for ex in dataset:
        for tag in ex['ner_tags']:
            if tag.startswith('B-'):
                entity_type = tag.split('-')[1]
                entity_types[entity_type] = entity_types.get(entity_type, 0) + 1
    return entity_types

print(count_entities(ner_processed))
""",

    "Özel Görevler - Sentiment Analysis": """
from datasets import Dataset

# Sentiment classification dataset
sentiment_dataset = Dataset.from_dict({
    'text': [
        'This product is amazing!',
        'Terrible, waste of money.',
        'It\\'s okay, nothing special.'
    ],
    'label': [2, 0, 1],  # 0: negative, 1: neutral, 2: positive
    'label_text': ['positive', 'negative', 'neutral']
})

# Feature extraction
def extract_sentiment_features(example):
    text = example['text'].lower()
    
    positive_words = ['amazing', 'great', 'excellent', 'love']
    negative_words = ['terrible', 'waste', 'bad', 'poor']
    
    pos_count = sum(1 for word in positive_words if word in text)
    neg_count = sum(1 for word in negative_words if word in text)
    
    return {
        **example,
        'positive_words': pos_count,
        'negative_words': neg_count,
        'sentiment_score': pos_count - neg_count,
        'has_exclamation': '!' in example['text']
    }

sentiment_featured = sentiment_dataset.map(extract_sentiment_features)

# Class balancing with augmentation
def balance_classes(dataset, target_per_class=100):
    from collections import defaultdict
    
    # Group by label
    by_label = defaultdict(list)
    for ex in dataset:
        by_label[ex['label']].append(ex)
    
    # Augment minority classes
    balanced = []
    for label, examples in by_label.items():
        balanced.extend(examples)
        
        # Add augmented copies if needed
        while len([e for e in balanced if e['label'] == label]) < target_per_class:
            # Simple augmentation: copy with modified text
            ex = examples[len(balanced) % len(examples)]
            balanced.append({
                **ex,
                'is_augmented': True
            })
    
    return Dataset.from_list(balanced)
"""
}

BEST_PRACTICES = """
# 🎯 Best Practices Özeti

## Memory Efficiency
```python
# ✅ DOĞRU: Streaming
dataset = load_dataset("huge_data", streaming=True)

# ❌ YANLIŞ: Tüm veriyi RAM'e yükleme
dataset = load_dataset("huge_data")  # 100GB RAM!
```

## Batch Processing
```python
# ✅ DOĞRU: Batched=True
dataset.map(fn, batched=True, batch_size=1000)

# ❌ YANLIŞ: Tek tek
dataset.map(fn)  # 10x-100x yavaş!
```

## Cross-Domain
```python
# ✅ DOĞRU: Normalize et
def normalize(ex, domain):
    return {'text': ex.get('text'), 'domain': domain}

# ❌ YANLIŞ: Direkt birleştir
concatenate_datasets([ds1, ds2])  # Error!
```

## Performans
- **Streaming**: RAM tasarrufu
- **Batched**: 10x-100x hız
- **num_proc**: CPU parallelization
- **Cache**: Tekrar kullanım
"""

def show_code(module_name):
    """Seçilen modül için kod göster"""
    return DEMO_CODES.get(module_name, "Kod örneği yükleniyor...")

def show_best_practices():
    """Best practices göster"""
    return BEST_PRACTICES

# Gradio Interface
with gr.Blocks(title="Advanced Dataset Tutorial", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📚 Advanced Dataset Tutorial
    ## Hugging Face Datasets - İleri Seviye Türkçe Eğitim
    
    Bu interaktif demo, 4 modül ve 20+ teknik içeren kapsamlı dataset eğitiminin özetini sunar.
    """)
    
    with gr.Tabs():
        with gr.Tab("🚀 Kod Örnekleri"):
            gr.Markdown("### Her modülden pratik kod örnekleri")
            
            module_dropdown = gr.Dropdown(
                choices=list(DEMO_CODES.keys()),
                label="Modül Seçin",
                value=list(DEMO_CODES.keys())[0]
            )
            
            code_output = gr.Code(
                label="Kod Örneği",
                language="python",
                value=DEMO_CODES[list(DEMO_CODES.keys())[0]]
            )
            
            module_dropdown.change(
                fn=show_code,
                inputs=[module_dropdown],
                outputs=[code_output]
            )
        
        with gr.Tab("📖 Modüller"):
            gr.Markdown("""
            ## 4 Ana Modül
            
            ### 1️⃣ Büyük Ölçekli Datasets
            - ⚡ Streaming (750GB+ data)
            - 💾 Batch processing (2.3x hızlı)
            - 🚀 Multi-processing (64x hızlı)
            - 📦 Cache (12.1x hızlı)
            
            ### 2️⃣ Domain-Specific Datasets
            - 🔬 Bilimsel makaleler (2,000 örnek)
            - 💻 Kod datasets (6 dil, 2,000 örnek)
            - 💰 Finansal veri (2,000 kayıt)
            - 🏥 Tıbbi veri (PHI anonymization)
            
            ### 3️⃣ İleri Teknikler
            - 📦 Custom Collators (3 tip)
            - 🔧 Feature Engineering (10+ feature)
            - 🎲 Data Augmentation (3x veri)
            - 📊 Advanced Sampling (diversity, stratified)
            
            ### 4️⃣ Özel Görevler
            - ❓ Question Answering (SQuAD)
            - 📝 Summarization (ROUGE)
            - 🏷️ NER (BIO tagging)
            - 😊 Sentiment Analysis
            - 📊 Multi-Task Learning
            """)
        
        with gr.Tab("🎯 Best Practices"):
            gr.Code(
                value=BEST_PRACTICES,
                label="Best Practices",
                language="python"
            )
        
        with gr.Tab("📊 Performans"):
            gr.Markdown("""
            ## Performans Metrikleri
            
            | Teknik | Artış | Kullanım |
            |--------|-------|----------|
            | **Batch Processing** | 2.3x | Tüm preprocessing |
            | **Cache** | 12.1x | Tekrar işlemler |
            | **Multi-Processing** | 64x | CPU tasks |
            | **Dynamic Batching** | 40% | Padding azalması |
            | **Data Augmentation** | 3x | Veri artışı |
            
            ## İstatistikler
            
            - 📝 **5,000+** kod satırı
            - 🔢 **20,000+** örnek dataset
            - 🛠️ **50+** teknik
            - ✅ **100+** best practice
            
            ## Kazanımlar
            
            ✅ Büyük ölçekli veri işleme  
            ✅ Domain-specific preprocessing  
            ✅ Production-ready pipelines  
            ✅ Task-specific optimization  
            ✅ Multi-task learning  
            """)
        
        with gr.Tab("ℹ️ Hakkında"):
            gr.Markdown("""
            ## Proje Bilgileri
            
            **Amaç:** Hugging Face Datasets kütüphanesini profesyonel düzeyde kullanmak isteyenler için kapsamlı Türkçe kaynak
            
            **İçerik:**
            - 4 ana modül
            - 20+ pratik örnek
            - 50+ teknik
            - 100+ best practice
            
            **Hedef Kitle:**
            - NLP mühendisleri
            - ML researchers
            - Data scientists
            - AI developers
            
            **Lisans:** MIT
            
            **Kaynaklar:**
            - [Hugging Face Datasets Docs](https://huggingface.co/docs/datasets)
            - [GitHub Repository](https://github.com/yourusername/advanced-dataset-tutorial)
            - [Hugging Face Hub](https://huggingface.co/datasets)
            
            ---
            
            ⭐ **Beğendiyseniz yıldız vermeyi unutmayın!**
            """)
    
    gr.Markdown("""
    ---
    💡 **Not:** Bu demo, tam eğitim materyalinin özeti içindir. Detaylı örnekler ve açıklamalar için modül scriptlerine bakın.
    """)

if __name__ == "__main__":
    demo.launch()