|
|
""" |
|
|
Advanced Dataset Tutorial - Interactive Gradio Demo |
|
|
=================================================== |
|
|
|
|
|
Hugging Face Datasets ile ileri seviye teknikler için interaktif demo |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import sys |
|
|
import os |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Make the tutorial's companion scripts importable from the sibling "modules" directory.
sys.path.append(str(Path(__file__).parent / "modules"))
|
|
|
|
|
|
|
|
# Self-contained code snippets shown in the "Kod Örnekleri" tab, keyed by
# module/topic name (the dropdown choices). Values are raw source text rendered
# in a gr.Code widget — they are display content only and are never executed by
# this app, so their (Turkish) inline comments are part of the displayed text.
DEMO_CODES = {
    # Module 1: streaming large datasets without loading them into RAM.
    "Büyük Ölçekli - Streaming": """
from datasets import load_dataset

# Streaming mode - RAM'i patlatmadan büyük veri
dataset = load_dataset(
    "c4",
    "en",
    split="train",
    streaming=True # ✨ Anahtar parametre
)

# İlk 1000 örneği işle
for i, example in enumerate(dataset.take(1000)):
    print(f"Example {i}: {example['text'][:100]}...")
""",

    # Module 1: single-example map vs. batched map.
    "Büyük Ölçekli - Batch Processing": """
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")

# ❌ YAVAŞ: Tek tek işleme
def process_single(example):
    return {'length': len(example['text'])}

slow = dataset.map(process_single)

# ✅ HIZLI: Batch processing
def process_batch(examples):
    return {'length': [len(t) for t in examples['text']]}

fast = dataset.map(
    process_batch,
    batched=True, # 🚀 10x-100x daha hızlı!
    batch_size=1000
)
""",

    # Module 2: unifying mismatched schemas before concatenate_datasets.
    "Domain-Specific - Cross-Domain Fix": """
from datasets import Dataset, concatenate_datasets
import json

# ❌ PROBLEM: Farklı schema'lar
sci_data = Dataset.from_dict({
    'text': ['Scientific paper...'],
    'metadata': {'year': 2024, 'citations': 10}
})

code_data = Dataset.from_dict({
    'code': ['def hello(): pass'],
    'language': 'Python'
})

# Bu HATA verir! ArrowTypeError
# combined = concatenate_datasets([sci_data, code_data])

# ✅ ÇÖZÜM: JSON metadata approach
def normalize_to_json(example, domain):
    return {
        'text': example.get('text') or example.get('code'),
        'domain': domain,
        'metadata_json': json.dumps(example.get('metadata', {}))
    }

sci_norm = sci_data.map(lambda x: normalize_to_json(x, 'scientific'))
code_norm = code_data.map(lambda x: normalize_to_json(x, 'code'))

# Şimdi ÇALIŞIR! ✅
combined = concatenate_datasets([sci_norm, code_norm])
""",

    # Module 3: a dynamic-padding collator class.
    "İleri Teknikler - Custom Collator": """
from datasets import Dataset

class AdvancedCollator:
    def __init__(self, max_length=128, pad_token='[PAD]'):
        self.max_length = max_length
        self.pad_token = pad_token

    def __call__(self, batch):
        # Tokenize (basit örnek)
        tokenized = [ex['text'].split()[:self.max_length]
                     for ex in batch]

        # Dynamic padding - batch içindeki max length'e göre
        max_len = max(len(tokens) for tokens in tokenized)

        padded = []
        masks = []
        for tokens in tokenized:
            pad_len = max_len - len(tokens)
            padded.append(tokens + [self.pad_token] * pad_len)
            masks.append([1] * len(tokens) + [0] * pad_len)

        return {
            'input_tokens': padded,
            'attention_mask': masks,
            'labels': [ex['label'] for ex in batch]
        }

# Kullanım
collator = AdvancedCollator()
batch = [
    {'text': 'Short text', 'label': 0},
    {'text': 'Much longer text here', 'label': 1}
]
collated = collator(batch)
""",

    # Module 3: simple text augmentation (deletion/swap) that expands a dataset.
    "İleri Teknikler - Data Augmentation": """
from datasets import Dataset
import random

class DataAugmenter:
    def augment(self, text):
        words = text.split()

        # Random word deletion
        if random.random() < 0.3:
            words = [w for w in words if random.random() > 0.1]

        # Random word swap
        if len(words) > 1 and random.random() < 0.3:
            i, j = random.sample(range(len(words)), 2)
            words[i], words[j] = words[j], words[i]

        return ' '.join(words) if words else text

    def augment_dataset(self, dataset, n_augmentations=2):
        augmented = []

        for example in dataset:
            # Original
            augmented.append({
                **example,
                'is_augmented': False
            })

            # Augmented versions
            for _ in range(n_augmentations):
                augmented.append({
                    **example,
                    'text': self.augment(example['text']),
                    'is_augmented': True
                })

        return Dataset.from_list(augmented)

# Kullanım: 1 örnek → 3 örnek (1 original + 2 augmented)
augmenter = DataAugmenter()
original = Dataset.from_dict({'text': ['Hello world'], 'label': [0]})
augmented = augmenter.augment_dataset(original, n_augmentations=2)
print(f"Dataset boyutu: {len(original)} → {len(augmented)}")
""",

    # Module 4: SQuAD-style QA preprocessing with answer-span validation.
    "Özel Görevler - Question Answering": """
from datasets import Dataset

# SQuAD-style QA dataset
qa_dataset = Dataset.from_dict({
    'context': [
        'The Eiffel Tower is in Paris. It was built in 1889.'
    ],
    'question': [
        'Where is the Eiffel Tower?'
    ],
    'answers': [{
        'text': ['Paris'],
        'answer_start': [23] # Character position
    }]
})

# Preprocessing
def preprocess_qa(example):
    # Answer'ı validate et
    context = example['context']
    answer = example['answers']['text'][0]
    start = example['answers']['answer_start'][0]

    # Extract ve kontrol et
    extracted = context[start:start + len(answer)]
    is_valid = extracted == answer

    return {
        **example,
        'is_valid': is_valid,
        'question_type': example['question'].split()[0].lower()
    }

qa_processed = qa_dataset.map(preprocess_qa)
""",

    # Module 4: BIO-tagged NER with tag→id conversion and entity counts.
    "Özel Görevler - NER": """
from datasets import Dataset

# Named Entity Recognition (BIO tagging)
ner_dataset = Dataset.from_dict({
    'tokens': [
        ['John', 'Smith', 'works', 'at', 'Google']
    ],
    'ner_tags': [
        ['B-PER', 'I-PER', 'O', 'O', 'B-ORG']
    ]
})

# Tag to ID mapping
tag2id = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6
}

# Convert tags to IDs
def convert_tags(example):
    return {
        **example,
        'ner_tag_ids': [tag2id[tag] for tag in example['ner_tags']],
        'sentence': ' '.join(example['tokens'])
    }

ner_processed = ner_dataset.map(convert_tags)

# Entity statistics
def count_entities(dataset):
    entity_types = {}
    for ex in dataset:
        for tag in ex['ner_tags']:
            if tag.startswith('B-'):
                entity_type = tag.split('-')[1]
                entity_types[entity_type] = entity_types.get(entity_type, 0) + 1
    return entity_types

print(count_entities(ner_processed))
""",

    # Module 4: sentiment features plus a class-balancing helper.
    "Özel Görevler - Sentiment Analysis": """
from datasets import Dataset

# Sentiment classification dataset
sentiment_dataset = Dataset.from_dict({
    'text': [
        'This product is amazing!',
        'Terrible, waste of money.',
        'It\\'s okay, nothing special.'
    ],
    'label': [2, 0, 1], # 0: negative, 1: neutral, 2: positive
    'label_text': ['positive', 'negative', 'neutral']
})

# Feature extraction
def extract_sentiment_features(example):
    text = example['text'].lower()

    positive_words = ['amazing', 'great', 'excellent', 'love']
    negative_words = ['terrible', 'waste', 'bad', 'poor']

    pos_count = sum(1 for word in positive_words if word in text)
    neg_count = sum(1 for word in negative_words if word in text)

    return {
        **example,
        'positive_words': pos_count,
        'negative_words': neg_count,
        'sentiment_score': pos_count - neg_count,
        'has_exclamation': '!' in example['text']
    }

sentiment_featured = sentiment_dataset.map(extract_sentiment_features)

# Class balancing with augmentation
def balance_classes(dataset, target_per_class=100):
    from collections import defaultdict

    # Group by label
    by_label = defaultdict(list)
    for ex in dataset:
        by_label[ex['label']].append(ex)

    # Augment minority classes
    balanced = []
    for label, examples in by_label.items():
        balanced.extend(examples)

        # Add augmented copies if needed
        while len([e for e in balanced if e['label'] == label]) < target_per_class:
            # Simple augmentation: copy with modified text
            ex = examples[len(balanced) % len(examples)]
            balanced.append({
                **ex,
                'is_augmented': True
            })

    return Dataset.from_list(balanced)
"""
}
|
|
|
|
|
# Markdown-formatted summary rendered verbatim in the "Best Practices" tab.
# NOTE: the UI below displays it through gr.Code with language="python", so the
# markdown markers are shown as plain text — this is display content only.
BEST_PRACTICES = """
# 🎯 Best Practices Özeti

## Memory Efficiency
```python
# ✅ DOĞRU: Streaming
dataset = load_dataset("huge_data", streaming=True)

# ❌ YANLIŞ: Tüm veriyi RAM'e yükleme
dataset = load_dataset("huge_data") # 100GB RAM!
```

## Batch Processing
```python
# ✅ DOĞRU: Batched=True
dataset.map(fn, batched=True, batch_size=1000)

# ❌ YANLIŞ: Tek tek
dataset.map(fn) # 10x-100x yavaş!
```

## Cross-Domain
```python
# ✅ DOĞRU: Normalize et
def normalize(ex, domain):
    return {'text': ex.get('text'), 'domain': domain}

# ❌ YANLIŞ: Direkt birleştir
concatenate_datasets([ds1, ds2]) # Error!
```

## Performans
- **Streaming**: RAM tasarrufu
- **Batched**: 10x-100x hız
- **num_proc**: CPU parallelization
- **Cache**: Tekrar kullanım
"""
|
|
|
|
|
def show_code(module_name):
    """Return the demo snippet registered under *module_name*.

    Unknown names yield the placeholder text shown while loading.
    """
    try:
        return DEMO_CODES[module_name]
    except KeyError:
        return "Kod örneği yükleniyor..."
|
|
|
|
|
def show_best_practices():
    """Return the static best-practices summary text."""
    summary = BEST_PRACTICES
    return summary
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Built at import time so `demo` is importable (e.g. by hosting
# platforms that call demo.launch() themselves); launched from the __main__
# guard at the bottom of the file.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Advanced Dataset Tutorial", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("""
# 📚 Advanced Dataset Tutorial
## Hugging Face Datasets - İleri Seviye Türkçe Eğitim

Bu interaktif demo, 4 modül ve 20+ teknik içeren kapsamlı dataset eğitiminin özetini sunar.
""")

    with gr.Tabs():
        # Tab 1: interactive snippet browser backed by DEMO_CODES.
        with gr.Tab("🚀 Kod Örnekleri"):
            gr.Markdown("### Her modülden pratik kod örnekleri")

            # Topic selector; defaults to the first DEMO_CODES entry.
            module_dropdown = gr.Dropdown(
                choices=list(DEMO_CODES.keys()),
                label="Modül Seçin",
                value=list(DEMO_CODES.keys())[0]
            )

            # Read-only code viewer, pre-filled with the default topic's snippet.
            code_output = gr.Code(
                label="Kod Örneği",
                language="python",
                value=DEMO_CODES[list(DEMO_CODES.keys())[0]]
            )

            # Swap the displayed snippet whenever the selection changes.
            module_dropdown.change(
                fn=show_code,
                inputs=[module_dropdown],
                outputs=[code_output]
            )

        # Tab 2: static overview of the four tutorial modules.
        with gr.Tab("📖 Modüller"):
            gr.Markdown("""
## 4 Ana Modül

### 1️⃣ Büyük Ölçekli Datasets
- ⚡ Streaming (750GB+ data)
- 💾 Batch processing (2.3x hızlı)
- 🚀 Multi-processing (64x hızlı)
- 📦 Cache (12.1x hızlı)

### 2️⃣ Domain-Specific Datasets
- 🔬 Bilimsel makaleler (2,000 örnek)
- 💻 Kod datasets (6 dil, 2,000 örnek)
- 💰 Finansal veri (2,000 kayıt)
- 🏥 Tıbbi veri (PHI anonymization)

### 3️⃣ İleri Teknikler
- 📦 Custom Collators (3 tip)
- 🔧 Feature Engineering (10+ feature)
- 🎲 Data Augmentation (3x veri)
- 📊 Advanced Sampling (diversity, stratified)

### 4️⃣ Özel Görevler
- ❓ Question Answering (SQuAD)
- 📝 Summarization (ROUGE)
- 🏷️ NER (BIO tagging)
- 😊 Sentiment Analysis
- 📊 Multi-Task Learning
""")

        # Tab 3: best-practices cheat sheet (BEST_PRACTICES constant, shown as code).
        with gr.Tab("🎯 Best Practices"):
            gr.Code(
                value=BEST_PRACTICES,
                label="Best Practices",
                language="python"
            )

        # Tab 4: static performance metrics and tutorial statistics.
        with gr.Tab("📊 Performans"):
            gr.Markdown("""
## Performans Metrikleri

| Teknik | Artış | Kullanım |
|--------|-------|----------|
| **Batch Processing** | 2.3x | Tüm preprocessing |
| **Cache** | 12.1x | Tekrar işlemler |
| **Multi-Processing** | 64x | CPU tasks |
| **Dynamic Batching** | 40% | Padding azalması |
| **Data Augmentation** | 3x | Veri artışı |

## İstatistikler

- 📝 **5,000+** kod satırı
- 🔢 **20,000+** örnek dataset
- 🛠️ **50+** teknik
- ✅ **100+** best practice

## Kazanımlar

✅ Büyük ölçekli veri işleme
✅ Domain-specific preprocessing
✅ Production-ready pipelines
✅ Task-specific optimization
✅ Multi-task learning
""")

        # Tab 5: about/credits page with external links.
        with gr.Tab("ℹ️ Hakkında"):
            gr.Markdown("""
## Proje Bilgileri

**Amaç:** Hugging Face Datasets kütüphanesini profesyonel düzeyde kullanmak isteyenler için kapsamlı Türkçe kaynak

**İçerik:**
- 4 ana modül
- 20+ pratik örnek
- 50+ teknik
- 100+ best practice

**Hedef Kitle:**
- NLP mühendisleri
- ML researchers
- Data scientists
- AI developers

**Lisans:** MIT

**Kaynaklar:**
- [Hugging Face Datasets Docs](https://huggingface.co/docs/datasets)
- [GitHub Repository](https://github.com/yourusername/advanced-dataset-tutorial)
- [Hugging Face Hub](https://huggingface.co/datasets)

---

⭐ **Beğendiyseniz yıldız vermeyi unutmayın!**
""")

    # Footer shown under every tab.
    gr.Markdown("""
---
💡 **Not:** Bu demo, tam eğitim materyalinin özeti içindir. Detaylı örnekler ve açıklamalar için modül scriptlerine bakın.
""")
|
|
|
|
|
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
|
|