# MEHMET TUĞRUL KAYA
# Initial commit: Advanced Dataset Tutorial
# 2e6a47d
"""
Advanced Dataset Tutorial - Interactive Gradio Demo
===================================================
Hugging Face Datasets ile ileri seviye teknikler için interaktif demo
"""
import gradio as gr
import sys
import os
from pathlib import Path
# Make the "modules" directory that sits next to this file importable.
sys.path.append(str(Path(__file__).with_name("modules")))
# Demo snippets shown in the "Kod Örnekleri" tab.
# Keys are the dropdown labels; values are the Python source rendered in the
# gr.Code viewer. Snippets are display-only (never executed by this app), but
# each one is kept syntactically valid so users can copy and run it.
DEMO_CODES = {
    "Büyük Ölçekli - Streaming": """
from datasets import load_dataset

# Streaming mode - RAM'i patlatmadan büyük veri
dataset = load_dataset(
    "allenai/c4",
    "en",
    split="train",
    streaming=True  # ✨ Anahtar parametre
)

# İlk 1000 örneği işle
for i, example in enumerate(dataset.take(1000)):
    print(f"Example {i}: {example['text'][:100]}...")
""",
    "Büyük Ölçekli - Batch Processing": """
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

# ❌ YAVAŞ: Tek tek işleme
def process_single(example):
    return {'length': len(example['text'])}

slow = dataset.map(process_single)

# ✅ HIZLI: Batch processing
def process_batch(examples):
    return {'length': [len(t) for t in examples['text']]}

fast = dataset.map(
    process_batch,
    batched=True,  # 🚀 10x-100x daha hızlı!
    batch_size=1000
)
""",
    "Domain-Specific - Cross-Domain Fix": """
from datasets import Dataset, concatenate_datasets
import json

# ❌ PROBLEM: Farklı schema'lar
sci_data = Dataset.from_dict({
    'text': ['Scientific paper...'],
    'metadata': [{'year': 2024, 'citations': 10}]
})
code_data = Dataset.from_dict({
    'code': ['def hello(): pass'],
    'language': ['Python']
})

# Bu HATA verir! ArrowTypeError
# combined = concatenate_datasets([sci_data, code_data])

# ✅ ÇÖZÜM: JSON metadata approach
def normalize_to_json(example, domain):
    return {
        'text': example.get('text') or example.get('code'),
        'domain': domain,
        'metadata_json': json.dumps(example.get('metadata', {}))
    }

# remove_columns: sadece ortak schema kalsın (text, domain, metadata_json)
sci_norm = sci_data.map(
    lambda x: normalize_to_json(x, 'scientific'),
    remove_columns=sci_data.column_names
)
code_norm = code_data.map(
    lambda x: normalize_to_json(x, 'code'),
    remove_columns=code_data.column_names
)

# Şimdi ÇALIŞIR! ✅
combined = concatenate_datasets([sci_norm, code_norm])
""",
    "İleri Teknikler - Custom Collator": """
from datasets import Dataset

class AdvancedCollator:
    def __init__(self, max_length=128, pad_token='[PAD]'):
        self.max_length = max_length
        self.pad_token = pad_token

    def __call__(self, batch):
        # Tokenize (basit örnek)
        tokenized = [ex['text'].split()[:self.max_length]
                     for ex in batch]

        # Dynamic padding - batch içindeki max length'e göre
        max_len = max(len(tokens) for tokens in tokenized)

        padded = []
        masks = []
        for tokens in tokenized:
            pad_len = max_len - len(tokens)
            padded.append(tokens + [self.pad_token] * pad_len)
            masks.append([1] * len(tokens) + [0] * pad_len)

        return {
            'input_tokens': padded,
            'attention_mask': masks,
            'labels': [ex['label'] for ex in batch]
        }

# Kullanım
collator = AdvancedCollator()
batch = [
    {'text': 'Short text', 'label': 0},
    {'text': 'Much longer text here', 'label': 1}
]
collated = collator(batch)
""",
    "İleri Teknikler - Data Augmentation": """
from datasets import Dataset
import random

class DataAugmenter:
    def augment(self, text):
        words = text.split()

        # Random word deletion
        if random.random() < 0.3:
            words = [w for w in words if random.random() > 0.1]

        # Random word swap
        if len(words) > 1 and random.random() < 0.3:
            i, j = random.sample(range(len(words)), 2)
            words[i], words[j] = words[j], words[i]

        return ' '.join(words) if words else text

    def augment_dataset(self, dataset, n_augmentations=2):
        augmented = []
        for example in dataset:
            # Original
            augmented.append({
                **example,
                'is_augmented': False
            })

            # Augmented versions
            for _ in range(n_augmentations):
                augmented.append({
                    **example,
                    'text': self.augment(example['text']),
                    'is_augmented': True
                })

        return Dataset.from_list(augmented)

# Kullanım: 1 örnek → 3 örnek (1 original + 2 augmented)
augmenter = DataAugmenter()
original = Dataset.from_dict({'text': ['Hello world'], 'label': [0]})
augmented = augmenter.augment_dataset(original, n_augmentations=2)
print(f"Dataset boyutu: {len(original)} → {len(augmented)}")
""",
    "Özel Görevler - Question Answering": """
from datasets import Dataset

# SQuAD-style QA dataset
qa_dataset = Dataset.from_dict({
    'context': [
        'The Eiffel Tower is in Paris. It was built in 1889.'
    ],
    'question': [
        'Where is the Eiffel Tower?'
    ],
    'answers': [{
        'text': ['Paris'],
        'answer_start': [23]  # Character position
    }]
})

# Preprocessing
def preprocess_qa(example):
    # Answer'ı validate et
    context = example['context']
    answer = example['answers']['text'][0]
    start = example['answers']['answer_start'][0]

    # Extract ve kontrol et
    extracted = context[start:start + len(answer)]
    is_valid = extracted == answer

    return {
        **example,
        'is_valid': is_valid,
        'question_type': example['question'].split()[0].lower()
    }

qa_processed = qa_dataset.map(preprocess_qa)
""",
    "Özel Görevler - NER": """
from datasets import Dataset

# Named Entity Recognition (BIO tagging)
ner_dataset = Dataset.from_dict({
    'tokens': [
        ['John', 'Smith', 'works', 'at', 'Google']
    ],
    'ner_tags': [
        ['B-PER', 'I-PER', 'O', 'O', 'B-ORG']
    ]
})

# Tag to ID mapping
tag2id = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6
}

# Convert tags to IDs
def convert_tags(example):
    return {
        **example,
        'ner_tag_ids': [tag2id[tag] for tag in example['ner_tags']],
        'sentence': ' '.join(example['tokens'])
    }

ner_processed = ner_dataset.map(convert_tags)

# Entity statistics
def count_entities(dataset):
    entity_types = {}
    for ex in dataset:
        for tag in ex['ner_tags']:
            if tag.startswith('B-'):
                entity_type = tag.split('-')[1]
                entity_types[entity_type] = entity_types.get(entity_type, 0) + 1
    return entity_types

print(count_entities(ner_processed))
""",
    "Özel Görevler - Sentiment Analysis": """
from datasets import Dataset

# Sentiment classification dataset
sentiment_dataset = Dataset.from_dict({
    'text': [
        'This product is amazing!',
        'Terrible, waste of money.',
        'It\\'s okay, nothing special.'
    ],
    'label': [2, 0, 1],  # 0: negative, 1: neutral, 2: positive
    'label_text': ['positive', 'negative', 'neutral']
})

# Feature extraction
def extract_sentiment_features(example):
    text = example['text'].lower()

    positive_words = ['amazing', 'great', 'excellent', 'love']
    negative_words = ['terrible', 'waste', 'bad', 'poor']

    pos_count = sum(1 for word in positive_words if word in text)
    neg_count = sum(1 for word in negative_words if word in text)

    return {
        **example,
        'positive_words': pos_count,
        'negative_words': neg_count,
        'sentiment_score': pos_count - neg_count,
        'has_exclamation': '!' in example['text']
    }

sentiment_featured = sentiment_dataset.map(extract_sentiment_features)

# Class balancing with augmentation
def balance_classes(dataset, target_per_class=100):
    from collections import defaultdict

    # Group by label
    by_label = defaultdict(list)
    for ex in dataset:
        by_label[ex['label']].append(ex)

    # Augment minority classes
    balanced = []
    for label, examples in by_label.items():
        # Originals: her satırda 'is_augmented' olsun (tutarlı schema)
        balanced.extend({**ex, 'is_augmented': False} for ex in examples)

        # Add augmented copies if needed
        while len([e for e in balanced if e['label'] == label]) < target_per_class:
            # Simple augmentation: copy with modified text
            ex = examples[len(balanced) % len(examples)]
            balanced.append({
                **ex,
                'is_augmented': True
            })

    return Dataset.from_list(balanced)
"""
}
# Markdown cheat sheet displayed in the "Best Practices" tab. The fenced
# blocks are illustrative Python; they are shown to the user, not executed.
BEST_PRACTICES = """
# 🎯 Best Practices Özeti

## Memory Efficiency

```python
# ✅ DOĞRU: Streaming
dataset = load_dataset("huge_data", streaming=True)

# ❌ YANLIŞ: Tüm veriyi RAM'e yükleme
dataset = load_dataset("huge_data")  # 100GB RAM!
```

## Batch Processing

```python
# ✅ DOĞRU: Batched=True
dataset.map(fn, batched=True, batch_size=1000)

# ❌ YANLIŞ: Tek tek
dataset.map(fn)  # 10x-100x yavaş!
```

## Cross-Domain

```python
# ✅ DOĞRU: Normalize et
def normalize(ex, domain):
    return {'text': ex.get('text'), 'domain': domain}

# ❌ YANLIŞ: Direkt birleştir
concatenate_datasets([ds1, ds2])  # Error!
```

## Performans

- **Streaming**: RAM tasarrufu
- **Batched**: 10x-100x hız
- **num_proc**: CPU parallelization
- **Cache**: Tekrar kullanım
"""
def show_code(module_name):
    """Return the demo snippet stored in DEMO_CODES under ``module_name``.

    When ``module_name`` is not a key of DEMO_CODES, a placeholder message is
    returned instead.
    """
    try:
        return DEMO_CODES[module_name]
    except KeyError:
        return "Kod örneği yükleniyor..."
def show_best_practices():
    """Return the BEST_PRACTICES Markdown text unchanged."""
    practices = BEST_PRACTICES
    return practices
# Gradio interface: a header, five tabs (code samples, module overview, best
# practices, performance figures, about) and a footer note.
# The Markdown literals are indented along with the surrounding code; this
# relies on gr.Markdown dedenting its value before rendering.
with gr.Blocks(title="Advanced Dataset Tutorial", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📚 Advanced Dataset Tutorial
    ## Hugging Face Datasets - İleri Seviye Türkçe Eğitim

    Bu interaktif demo, 4 modül ve 20+ teknik içeren kapsamlı dataset eğitiminin özetini sunar.
    """)

    with gr.Tabs():
        with gr.Tab("🚀 Kod Örnekleri"):
            gr.Markdown("### Her modülden pratik kod örnekleri")

            # Compute the snippet titles once; the first one is the default
            # selection for both the dropdown and the code viewer.
            module_names = list(DEMO_CODES)
            default_module = module_names[0]

            module_dropdown = gr.Dropdown(
                choices=module_names,
                label="Modül Seçin",
                value=default_module,
            )
            code_output = gr.Code(
                label="Kod Örneği",
                language="python",
                value=DEMO_CODES[default_module],
            )

            # Swap the displayed snippet whenever the selection changes.
            module_dropdown.change(
                fn=show_code,
                inputs=[module_dropdown],
                outputs=[code_output],
            )

        with gr.Tab("📖 Modüller"):
            gr.Markdown("""
            ## 4 Ana Modül

            ### 1️⃣ Büyük Ölçekli Datasets
            - ⚡ Streaming (750GB+ data)
            - 💾 Batch processing (2.3x hızlı)
            - 🚀 Multi-processing (64x hızlı)
            - 📦 Cache (12.1x hızlı)

            ### 2️⃣ Domain-Specific Datasets
            - 🔬 Bilimsel makaleler (2,000 örnek)
            - 💻 Kod datasets (6 dil, 2,000 örnek)
            - 💰 Finansal veri (2,000 kayıt)
            - 🏥 Tıbbi veri (PHI anonymization)

            ### 3️⃣ İleri Teknikler
            - 📦 Custom Collators (3 tip)
            - 🔧 Feature Engineering (10+ feature)
            - 🎲 Data Augmentation (3x veri)
            - 📊 Advanced Sampling (diversity, stratified)

            ### 4️⃣ Özel Görevler
            - ❓ Question Answering (SQuAD)
            - 📝 Summarization (ROUGE)
            - 🏷️ NER (BIO tagging)
            - 😊 Sentiment Analysis
            - 📊 Multi-Task Learning
            """)

        with gr.Tab("🎯 Best Practices"):
            # BEST_PRACTICES is Markdown (headings, bullet lists, fenced code),
            # so render it as Markdown rather than as Python source.
            gr.Markdown(BEST_PRACTICES)

        with gr.Tab("📊 Performans"):
            gr.Markdown("""
            ## Performans Metrikleri

            | Teknik | Artış | Kullanım |
            |--------|-------|----------|
            | **Batch Processing** | 2.3x | Tüm preprocessing |
            | **Cache** | 12.1x | Tekrar işlemler |
            | **Multi-Processing** | 64x | CPU tasks |
            | **Dynamic Batching** | 40% | Padding azalması |
            | **Data Augmentation** | 3x | Veri artışı |

            ## İstatistikler
            - 📝 **5,000+** kod satırı
            - 🔢 **20,000+** örnek dataset
            - 🛠️ **50+** teknik
            - ✅ **100+** best practice

            ## Kazanımlar

            ✅ Büyük ölçekli veri işleme

            ✅ Domain-specific preprocessing

            ✅ Production-ready pipelines

            ✅ Task-specific optimization

            ✅ Multi-task learning
            """)

        with gr.Tab("ℹ️ Hakkında"):
            gr.Markdown("""
            ## Proje Bilgileri

            **Amaç:** Hugging Face Datasets kütüphanesini profesyonel düzeyde kullanmak isteyenler için kapsamlı Türkçe kaynak

            **İçerik:**
            - 4 ana modül
            - 20+ pratik örnek
            - 50+ teknik
            - 100+ best practice

            **Hedef Kitle:**
            - NLP mühendisleri
            - ML researchers
            - Data scientists
            - AI developers

            **Lisans:** MIT

            **Kaynaklar:**
            - [Hugging Face Datasets Docs](https://huggingface.co/docs/datasets)
            - [GitHub Repository](https://github.com/yourusername/advanced-dataset-tutorial)
            - [Hugging Face Hub](https://huggingface.co/datasets)

            ---

            ⭐ **Beğendiyseniz yıldız vermeyi unutmayın!**
            """)

    gr.Markdown("""
    ---

    💡 **Not:** Bu demo, tam eğitim materyalinin özeti içindir. Detaylı örnekler ve açıklamalar için modül scriptlerine bakın.
    """)

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()