Spaces:

tugrulkaya
/

advanced-dataset-tutorial

Sleeping

advanced-dataset-tutorial / space /app.py

MEHMET TUĞRUL KAYA

Initial commit: Advanced Dataset Tutorial

2e6a47d 4 months ago

14.2 kB

	"""
	Advanced Dataset Tutorial - Interactive Gradio Demo
	===================================================

	Hugging Face Datasets ile ileri seviye teknikler için interaktif demo
	"""

	import gradio as gr
	import sys
	import os
	from pathlib import Path

	# Put the sibling "modules" directory on sys.path so the modules in it can be imported
	sys.path.append(str(Path(__file__).parent / "modules"))

	# Example snippets for the demo, keyed by the label listed in the module
	# dropdown. Each value is display-only text (shown in the Gradio code viewer),
	# so the snippet bodies start at the left margin to keep their own indentation
	# intact when rendered.
	DEMO_CODES = {
	    "Büyük Ölçekli - Streaming": """
	from datasets import load_dataset

	# Streaming mode - RAM'i patlatmadan büyük veri
	dataset = load_dataset(
	    "allenai/c4",
	    "en",
	    split="train",
	    streaming=True  # ✨ Anahtar parametre
	)

	# İlk 1000 örneği işle
	for i, example in enumerate(dataset.take(1000)):
	    print(f"Example {i}: {example['text'][:100]}...")
	""",

	    "Büyük Ölçekli - Batch Processing": """
	from datasets import load_dataset

	dataset = load_dataset("imdb", split="train")

	# ❌ YAVAŞ: Tek tek işleme
	def process_single(example):
	    return {'length': len(example['text'])}

	slow = dataset.map(process_single)

	# ✅ HIZLI: Batch processing
	def process_batch(examples):
	    return {'length': [len(t) for t in examples['text']]}

	fast = dataset.map(
	    process_batch,
	    batched=True,  # 🚀 10x-100x daha hızlı!
	    batch_size=1000
	)
	""",

	    "Domain-Specific - Cross-Domain Fix": """
	from datasets import Dataset, concatenate_datasets
	import json

	# ❌ PROBLEM: Farklı schema'lar
	sci_data = Dataset.from_dict({
	    'text': ['Scientific paper...'],
	    'metadata': [{'year': 2024, 'citations': 10}]
	})

	code_data = Dataset.from_dict({
	    'code': ['def hello(): pass'],
	    'language': ['Python']
	})

	# Bu HATA verir! ArrowTypeError
	# combined = concatenate_datasets([sci_data, code_data])

	# ✅ ÇÖZÜM: JSON metadata approach
	def normalize_to_json(example, domain):
	    return {
	        'text': example.get('text') or example.get('code'),
	        'domain': domain,
	        'metadata_json': json.dumps(example.get('metadata', {}))
	    }

	# remove_columns: eski kolonları at, iki dataset aynı schema'ya sahip olsun
	sci_norm = sci_data.map(
	    lambda x: normalize_to_json(x, 'scientific'),
	    remove_columns=sci_data.column_names
	)
	code_norm = code_data.map(
	    lambda x: normalize_to_json(x, 'code'),
	    remove_columns=code_data.column_names
	)

	# Şimdi ÇALIŞIR! ✅
	combined = concatenate_datasets([sci_norm, code_norm])
	""",

	    "İleri Teknikler - Custom Collator": """
	from datasets import Dataset

	class AdvancedCollator:
	    def __init__(self, max_length=128, pad_token='[PAD]'):
	        self.max_length = max_length
	        self.pad_token = pad_token

	    def __call__(self, batch):
	        # Tokenize (basit örnek)
	        tokenized = [ex['text'].split()[:self.max_length]
	                     for ex in batch]

	        # Dynamic padding - batch içindeki max length'e göre
	        max_len = max(len(tokens) for tokens in tokenized)

	        padded = []
	        masks = []
	        for tokens in tokenized:
	            pad_len = max_len - len(tokens)
	            padded.append(tokens + [self.pad_token] * pad_len)
	            masks.append([1] * len(tokens) + [0] * pad_len)

	        return {
	            'input_tokens': padded,
	            'attention_mask': masks,
	            'labels': [ex['label'] for ex in batch]
	        }

	# Kullanım
	collator = AdvancedCollator()
	batch = [
	    {'text': 'Short text', 'label': 0},
	    {'text': 'Much longer text here', 'label': 1}
	]
	collated = collator(batch)
	""",

	    "İleri Teknikler - Data Augmentation": """
	from datasets import Dataset
	import random

	class DataAugmenter:
	    def augment(self, text):
	        words = text.split()

	        # Random word deletion
	        if random.random() < 0.3:
	            words = [w for w in words if random.random() > 0.1]

	        # Random word swap
	        if len(words) > 1 and random.random() < 0.3:
	            i, j = random.sample(range(len(words)), 2)
	            words[i], words[j] = words[j], words[i]

	        return ' '.join(words) if words else text

	    def augment_dataset(self, dataset, n_augmentations=2):
	        augmented = []

	        for example in dataset:
	            # Original
	            augmented.append({
	                **example,
	                'is_augmented': False
	            })

	            # Augmented versions
	            for _ in range(n_augmentations):
	                augmented.append({
	                    **example,
	                    'text': self.augment(example['text']),
	                    'is_augmented': True
	                })

	        return Dataset.from_list(augmented)

	# Kullanım: 1 örnek → 3 örnek (1 original + 2 augmented)
	augmenter = DataAugmenter()
	original = Dataset.from_dict({'text': ['Hello world'], 'label': [0]})
	augmented = augmenter.augment_dataset(original, n_augmentations=2)
	print(f"Dataset boyutu: {len(original)} → {len(augmented)}")
	""",

	    "Özel Görevler - Question Answering": """
	from datasets import Dataset

	# SQuAD-style QA dataset
	qa_dataset = Dataset.from_dict({
	    'context': [
	        'The Eiffel Tower is in Paris. It was built in 1889.'
	    ],
	    'question': [
	        'Where is the Eiffel Tower?'
	    ],
	    'answers': [{
	        'text': ['Paris'],
	        'answer_start': [23]  # Character position
	    }]
	})

	# Preprocessing
	def preprocess_qa(example):
	    # Answer'ı validate et
	    context = example['context']
	    answer = example['answers']['text'][0]
	    start = example['answers']['answer_start'][0]

	    # Extract ve kontrol et
	    extracted = context[start:start + len(answer)]
	    is_valid = extracted == answer

	    return {
	        **example,
	        'is_valid': is_valid,
	        'question_type': example['question'].split()[0].lower()
	    }

	qa_processed = qa_dataset.map(preprocess_qa)
	""",

	    "Özel Görevler - NER": """
	from datasets import Dataset

	# Named Entity Recognition (BIO tagging)
	ner_dataset = Dataset.from_dict({
	    'tokens': [
	        ['John', 'Smith', 'works', 'at', 'Google']
	    ],
	    'ner_tags': [
	        ['B-PER', 'I-PER', 'O', 'O', 'B-ORG']
	    ]
	})

	# Tag to ID mapping
	tag2id = {
	    'O': 0,
	    'B-PER': 1, 'I-PER': 2,
	    'B-ORG': 3, 'I-ORG': 4,
	    'B-LOC': 5, 'I-LOC': 6
	}

	# Convert tags to IDs
	def convert_tags(example):
	    return {
	        **example,
	        'ner_tag_ids': [tag2id[tag] for tag in example['ner_tags']],
	        'sentence': ' '.join(example['tokens'])
	    }

	ner_processed = ner_dataset.map(convert_tags)

	# Entity statistics
	def count_entities(dataset):
	    entity_types = {}
	    for ex in dataset:
	        for tag in ex['ner_tags']:
	            if tag.startswith('B-'):
	                entity_type = tag.split('-')[1]
	                entity_types[entity_type] = entity_types.get(entity_type, 0) + 1
	    return entity_types

	print(count_entities(ner_processed))
	""",

	    "Özel Görevler - Sentiment Analysis": """
	from datasets import Dataset

	# Sentiment classification dataset
	sentiment_dataset = Dataset.from_dict({
	    'text': [
	        'This product is amazing!',
	        'Terrible, waste of money.',
	        'It\\'s okay, nothing special.'
	    ],
	    'label': [2, 0, 1],  # 0: negative, 1: neutral, 2: positive
	    'label_text': ['positive', 'negative', 'neutral']
	})

	# Feature extraction
	def extract_sentiment_features(example):
	    text = example['text'].lower()

	    positive_words = ['amazing', 'great', 'excellent', 'love']
	    negative_words = ['terrible', 'waste', 'bad', 'poor']

	    pos_count = sum(1 for word in positive_words if word in text)
	    neg_count = sum(1 for word in negative_words if word in text)

	    return {
	        **example,
	        'positive_words': pos_count,
	        'negative_words': neg_count,
	        'sentiment_score': pos_count - neg_count,
	        'has_exclamation': '!' in example['text']
	    }

	sentiment_featured = sentiment_dataset.map(extract_sentiment_features)

	# Class balancing with augmentation
	def balance_classes(dataset, target_per_class=100):
	    from collections import defaultdict

	    # Group by label
	    by_label = defaultdict(list)
	    for ex in dataset:
	        by_label[ex['label']].append(ex)

	    # Augment minority classes
	    balanced = []
	    for examples in by_label.values():
	        # Flag the originals too, so every row has the same keys
	        balanced.extend({**ex, 'is_augmented': False} for ex in examples)

	        # Add augmented copies if needed
	        for i in range(max(0, target_per_class - len(examples))):
	            # Simple oversampling: cycle through the originals
	            balanced.append({
	                **examples[i % len(examples)],
	                'is_augmented': True
	            })

	    return Dataset.from_list(balanced)
	""",
	}

	# Markdown summary of the tutorial's recommendations (headings plus fenced
	# Python examples). The text starts at the left margin so the fenced code keeps
	# its own indentation.
	BEST_PRACTICES = """
	# 🎯 Best Practices Özeti

	## Memory Efficiency
	```python
	# ✅ DOĞRU: Streaming
	dataset = load_dataset("huge_data", streaming=True)

	# ❌ YANLIŞ: Tüm veriyi RAM'e yükleme
	dataset = load_dataset("huge_data")  # 100GB RAM!
	```

	## Batch Processing
	```python
	# ✅ DOĞRU: Batched=True
	dataset.map(fn, batched=True, batch_size=1000)

	# ❌ YANLIŞ: Tek tek
	dataset.map(fn)  # 10x-100x yavaş!
	```

	## Cross-Domain
	```python
	# ✅ DOĞRU: Normalize et
	def normalize(ex, domain):
	    return {'text': ex.get('text'), 'domain': domain}

	# ❌ YANLIŞ: Direkt birleştir
	concatenate_datasets([ds1, ds2])  # Error!
	```

	## Performans
	- Streaming: RAM tasarrufu
	- Batched: 10x-100x hız
	- num_proc: CPU parallelization
	- Cache: Tekrar kullanım
	"""

	def show_code(module_name):
	    """Look up the demo snippet registered under ``module_name``.

	    Args:
	        module_name: A key of ``DEMO_CODES``.

	    Returns:
	        The matching snippet text, or a placeholder message when the key is
	        not present in ``DEMO_CODES``.
	    """
	    try:
	        return DEMO_CODES[module_name]
	    except KeyError:
	        return "Kod örneği yükleniyor..."

	def show_best_practices():
	    """Return the ``BEST_PRACTICES`` summary text unchanged."""
	    summary = BEST_PRACTICES
	    return summary

	# Gradio interface: a Blocks app with a header, five tabs and a footer note.
	with gr.Blocks(title="Advanced Dataset Tutorial", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("""
	    # 📚 Advanced Dataset Tutorial
	    ## Hugging Face Datasets - İleri Seviye Türkçe Eğitim

	    Bu interaktif demo, 4 modül ve 20+ teknik içeren kapsamlı dataset eğitiminin özetini sunar.
	    """)

	    with gr.Tabs():
	        with gr.Tab("🚀 Kod Örnekleri"):
	            gr.Markdown("### Her modülden pratik kod örnekleri")

	            # The first registered snippet is both the initial dropdown
	            # selection and the initial content of the code viewer.
	            module_names = list(DEMO_CODES)
	            default_module = module_names[0]

	            module_dropdown = gr.Dropdown(
	                choices=module_names,
	                label="Modül Seçin",
	                value=default_module
	            )

	            code_output = gr.Code(
	                label="Kod Örneği",
	                language="python",
	                value=DEMO_CODES[default_module]
	            )

	            # Swap the displayed snippet whenever the selection changes.
	            module_dropdown.change(
	                fn=show_code,
	                inputs=[module_dropdown],
	                outputs=[code_output]
	            )

	        with gr.Tab("📖 Modüller"):
	            gr.Markdown("""
	            ## 4 Ana Modül

	            ### 1️⃣ Büyük Ölçekli Datasets
	            - ⚡ Streaming (750GB+ data)
	            - 💾 Batch processing (2.3x hızlı)
	            - 🚀 Multi-processing (64x hızlı)
	            - 📦 Cache (12.1x hızlı)

	            ### 2️⃣ Domain-Specific Datasets
	            - 🔬 Bilimsel makaleler (2,000 örnek)
	            - 💻 Kod datasets (6 dil, 2,000 örnek)
	            - 💰 Finansal veri (2,000 kayıt)
	            - 🏥 Tıbbi veri (PHI anonymization)

	            ### 3️⃣ İleri Teknikler
	            - 📦 Custom Collators (3 tip)
	            - 🔧 Feature Engineering (10+ feature)
	            - 🎲 Data Augmentation (3x veri)
	            - 📊 Advanced Sampling (diversity, stratified)

	            ### 4️⃣ Özel Görevler
	            - ❓ Question Answering (SQuAD)
	            - 📝 Summarization (ROUGE)
	            - 🏷️ NER (BIO tagging)
	            - 😊 Sentiment Analysis
	            - 📊 Multi-Task Learning
	            """)

	        with gr.Tab("🎯 Best Practices"):
	            # BEST_PRACTICES is a Markdown document (headings and fenced code),
	            # so it is highlighted as Markdown rather than as Python.
	            gr.Code(
	                value=BEST_PRACTICES,
	                label="Best Practices",
	                language="markdown"
	            )

	        with gr.Tab("📊 Performans"):
	            # Table cells use plain "|" separators: a backslash-escaped pipe is
	            # an invalid Python escape sequence and is not a Markdown table
	            # delimiter.
	            gr.Markdown("""
	            ## Performans Metrikleri

	            | Teknik | Artış | Kullanım |
	            |--------|-------|----------|
	            | Batch Processing | 2.3x | Tüm preprocessing |
	            | Cache | 12.1x | Tekrar işlemler |
	            | Multi-Processing | 64x | CPU tasks |
	            | Dynamic Batching | 40% | Padding azalması |
	            | Data Augmentation | 3x | Veri artışı |

	            ## İstatistikler

	            - 📝 5,000+ kod satırı
	            - 🔢 20,000+ örnek dataset
	            - 🛠️ 50+ teknik
	            - ✅ 100+ best practice

	            ## Kazanımlar

	            ✅ Büyük ölçekli veri işleme
	            ✅ Domain-specific preprocessing
	            ✅ Production-ready pipelines
	            ✅ Task-specific optimization
	            ✅ Multi-task learning
	            """)

	        with gr.Tab("ℹ️ Hakkında"):
	            gr.Markdown("""
	            ## Proje Bilgileri

	            Amaç: Hugging Face Datasets kütüphanesini profesyonel düzeyde kullanmak isteyenler için kapsamlı Türkçe kaynak

	            İçerik:
	            - 4 ana modül
	            - 20+ pratik örnek
	            - 50+ teknik
	            - 100+ best practice

	            Hedef Kitle:
	            - NLP mühendisleri
	            - ML researchers
	            - Data scientists
	            - AI developers

	            Lisans: MIT

	            Kaynaklar:
	            - [Hugging Face Datasets Docs](https://huggingface.co/docs/datasets)
	            - [GitHub Repository](https://github.com/yourusername/advanced-dataset-tutorial)
	            - [Hugging Face Hub](https://huggingface.co/datasets)

	            ---

	            ⭐ Beğendiyseniz yıldız vermeyi unutmayın!
	            """)

	    gr.Markdown("""
	    ---
	    💡 Not: Bu demo, tam eğitim materyalinin özeti içindir. Detaylı örnekler ve açıklamalar için modül scriptlerine bakın.
	    """)

	# Start the app only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()