"""
DOMAIN-SPECIFIC DATASETS - ADVANCED HUGGING FACE
================================================
What you will learn in this module:
1. Scientific papers (arXiv, PubMed) - academic datasets
2. Code datasets (The Stack, CodeParrot) - programming datasets
3. Financial analysis datasets - finance & business
4. Medical/healthcare datasets - medical & healthcare
5. Domain-specific preprocessing
6. Custom tokenization
7. Domain adaptation techniques
"""
from datasets import Dataset, concatenate_datasets
import numpy as np
from collections import Counter
import re
print("="*70)
print("🔬 DOMAIN-SPECIFIC DATASETS - İLERİ SEVİYE")
print("="*70)
print("\n" + "="*70)
print("1. BİLİMSEL MAKALELER - ACADEMIC DATASETS")
print("="*70)
# Synthetic scientific-paper dataset
def generate_scientific_papers(num_samples=1000):
"""
    Synthetic data in scientific-paper format
"""
domains = ['Physics', 'Computer Science', 'Biology', 'Mathematics', 'Chemistry']
def gen():
for i in range(num_samples):
domain = np.random.choice(domains)
            # Paper structure
abstract = f"This paper presents a novel approach to {domain.lower()} research. " \
f"We propose a methodology that addresses key challenges in the field. " \
f"Our experimental results show significant improvements over baseline methods. " \
f"The proposed framework demonstrates applicability across multiple scenarios."
yield {
'id': f'arxiv.{i:06d}',
'title': f'Advanced Methods in {domain} Research: A Comprehensive Study {i}',
'abstract': abstract,
'authors': [f'Author {j}' for j in range(np.random.randint(2, 6))],
'domain': domain,
'year': np.random.randint(2015, 2025),
'citations': np.random.randint(0, 500),
'keywords': [f'keyword{j}' for j in range(np.random.randint(3, 8))],
'full_text': abstract + " " + abstract * np.random.randint(5, 15)
}
return Dataset.from_generator(gen)
print("\n📚 Bilimsel Makale Dataset'i Oluşturuluyor...")
scientific_dataset = generate_scientific_papers(2000)
print(f"✅ {len(scientific_dataset)} bilimsel makale yüklendi")
print(f"\nÖrnek makale:")
sample = scientific_dataset[0]
print(f" ID: {sample['id']}")
print(f" Başlık: {sample['title']}")
print(f" Domain: {sample['domain']}")
print(f" Yazar sayısı: {len(sample['authors'])}")
print(f" Yıl: {sample['year']}")
print(f" Atıf sayısı: {sample['citations']}")
print(f" Abstract: {sample['abstract'][:150]}...")
# Per-domain statistics
print("\n📊 Domain Distribution:")
domains = [ex['domain'] for ex in scientific_dataset]
domain_counts = Counter(domains)
for domain, count in domain_counts.most_common():
pct = (count / len(scientific_dataset)) * 100
print(f" {domain}: {count} ({pct:.1f}%)")
# Analysis by year
print("\n📅 Publications per Year:")
years = [ex['year'] for ex in scientific_dataset]
year_counts = Counter(years)
for year in sorted(year_counts.keys())[-5:]:
print(f" {year}: {year_counts[year]} makale")
# Citation analysis
citations = [ex['citations'] for ex in scientific_dataset]
print(f"\n📈 Citation Statistics:")
print(f" Mean: {np.mean(citations):.1f}")
print(f" Median: {np.median(citations):.1f}")
print(f" Most citations: {np.max(citations)}")
# Preprocessing - cleaning scientific text
print("\n🔧 Scientific Text Preprocessing:")
def preprocess_scientific_text(examples):
"""
    Custom preprocessing for scientific text
"""
processed = []
for text in examples['abstract']:
        # Lowercase
        text = text.lower()
        # Strip special characters
        text = re.sub(r'[^\w\s\.]', '', text)
        # Collapse extra whitespace
        text = ' '.join(text.split())
processed.append(text)
return {
'abstract_clean': processed,
'abstract_length': [len(t) for t in processed],
'word_count': [len(t.split()) for t in processed]
}
scientific_processed = scientific_dataset.map(
preprocess_scientific_text,
batched=True,
batch_size=500,
desc="Preprocessing scientific texts"
)
print(f"✅ {len(scientific_processed)} makale işlendi")
print(f"\nÖrnek işlenmiş abstract:")
print(f" Original: {scientific_processed[0]['abstract'][:100]}...")
print(f" Cleaned: {scientific_processed[0]['abstract_clean'][:100]}...")
print(f" Word count: {scientific_processed[0]['word_count']}")
print("\n" + "="*70)
print("2. KOD DATASETS - PROGRAMMING & SOFTWARE")
print("="*70)
# Synthetic code dataset
def generate_code_dataset(num_samples=1000):
"""
    Code samples in several programming languages
"""
languages = ['Python', 'JavaScript', 'Java', 'C++', 'Go', 'Rust']
code_templates = {
'Python': '''def {func_name}({params}):
"""
{docstring}
"""
result = {body}
return result''',
'JavaScript': '''function {func_name}({params}) {{
// {docstring}
const result = {body};
return result;
}}''',
'Java': '''public {return_type} {func_name}({params}) {{
// {docstring}
{return_type} result = {body};
return result;
}}''',
}
def gen():
for i in range(num_samples):
lang = np.random.choice(languages)
            # Code attributes
func_name = f"process_data_{i}"
params = "data, config"
docstring = f"Process data using method {i}"
body = "data * 2 + config"
if lang in code_templates:
code = code_templates[lang].format(
func_name=func_name,
params=params,
docstring=docstring,
body=body,
return_type='int' if lang == 'Java' else ''
)
else:
code = f"// {lang} code example\n{func_name}({params})"
yield {
'id': f'code_{i:06d}',
'language': lang,
'code': code,
'func_name': func_name,
'lines_of_code': len(code.split('\n')),
                'has_docstring': '"""' in code,  # only the Python template emits a real docstring
'complexity': np.random.choice(['low', 'medium', 'high']),
'repo': f'github.com/user/repo_{i % 100}',
'stars': np.random.randint(0, 10000)
}
return Dataset.from_generator(gen)
print("\n💻 Kod Dataset'i Oluşturuluyor...")
code_dataset = generate_code_dataset(2000)
print(f"✅ {len(code_dataset)} kod örneği yüklendi")
print(f"\nÖrnek kod:")
code_sample = code_dataset[0]
print(f" ID: {code_sample['id']}")
print(f" Dil: {code_sample['language']}")
print(f" Satır sayısı: {code_sample['lines_of_code']}")
print(f" Karmaşıklık: {code_sample['complexity']}")
print(f"\n Kod:\n{code_sample['code']}\n")
# Language distribution
print("\n📊 Programming Language Distribution:")
languages = [ex['language'] for ex in code_dataset]
lang_counts = Counter(languages)
for lang, count in lang_counts.most_common():
pct = (count / len(code_dataset)) * 100
print(f" {lang}: {count} ({pct:.1f}%)")
# Code analysis
print("\n📈 Code Metrics:")
loc_values = [ex['lines_of_code'] for ex in code_dataset]
print(f" Mean lines of code: {np.mean(loc_values):.1f}")
print(f" Median lines of code: {np.median(loc_values):.1f}")
has_docstring = sum([1 for ex in code_dataset if ex['has_docstring']])
print(f" Docstring ratio: {(has_docstring/len(code_dataset)*100):.1f}%")
# Code preprocessing
print("\n🔧 Code Preprocessing:")
def preprocess_code(examples):
"""
    Custom preprocessing for code
"""
def extract_functions(code):
        # Extract function names (simple regex)
funcs = re.findall(r'def\s+(\w+)|function\s+(\w+)|public\s+\w+\s+(\w+)', code)
return [f for group in funcs for f in group if f]
def count_comments(code):
        # Count comment markers
return len(re.findall(r'#|//|/\*|\*/', code))
return {
'functions': [extract_functions(code) for code in examples['code']],
'comment_count': [count_comments(code) for code in examples['code']],
'code_chars': [len(code) for code in examples['code']],
'code_tokens': [len(code.split()) for code in examples['code']]
}
code_processed = code_dataset.map(
preprocess_code,
batched=True,
batch_size=500,
desc="Analyzing code"
)
print(f"✅ {len(code_processed)} kod örneği analiz edildi")
print(f"\nÖrnek analiz:")
print(f" Fonksiyonlar: {code_processed[0]['functions']}")
print(f" Yorum sayısı: {code_processed[0]['comment_count']}")
print(f" Token sayısı: {code_processed[0]['code_tokens']}")
print("\n" + "="*70)
print("3. FİNANSAL ANALİZ DATASETS")
print("="*70)
# Synthetic financial data
def generate_financial_dataset(num_samples=1000):
"""
    Financial news and analysis dataset
"""
companies = ['TechCorp', 'FinanceBank', 'RetailCo', 'EnergyInc', 'HealthMed']
sentiments = ['positive', 'negative', 'neutral']
categories = ['earnings', 'merger', 'product_launch', 'scandal', 'expansion']
def gen():
for i in range(num_samples):
company = np.random.choice(companies)
sentiment = np.random.choice(sentiments)
category = np.random.choice(categories)
            # Financial news text
if sentiment == 'positive':
text = f"{company} announces strong quarterly earnings, exceeding market expectations. " \
f"Stock prices surged following the announcement. Analysts remain optimistic."
elif sentiment == 'negative':
text = f"{company} faces challenges in the current market. " \
f"Quarterly results fell short of expectations. Investors express concern."
else:
text = f"{company} maintains steady performance in Q{i%4+1}. " \
f"Market reaction remains moderate. Company outlook unchanged."
yield {
'id': f'fin_{i:06d}',
'company': company,
'text': text,
'sentiment': sentiment,
'category': category,
'date': f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}',
'stock_change': np.random.uniform(-10, 10),
'volume': np.random.randint(1000000, 10000000),
'market_cap': np.random.uniform(1e9, 100e9),
'sector': np.random.choice(['Tech', 'Finance', 'Retail', 'Energy', 'Healthcare'])
}
return Dataset.from_generator(gen)
print("\n💰 Finansal Dataset Oluşturuluyor...")
financial_dataset = generate_financial_dataset(2000)
print(f"✅ {len(financial_dataset)} finansal kayıt yüklendi")
print(f"\nÖrnek finansal kayıt:")
fin_sample = financial_dataset[0]
print(f" ID: {fin_sample['id']}")
print(f" Şirket: {fin_sample['company']}")
print(f" Sentiment: {fin_sample['sentiment']}")
print(f" Kategori: {fin_sample['category']}")
print(f" Hisse değişimi: {fin_sample['stock_change']:.2f}%")
print(f" Metin: {fin_sample['text'][:120]}...")
# Sentiment analysis
print("\n📊 Sentiment Distribution:")
sentiments = [ex['sentiment'] for ex in financial_dataset]
sent_counts = Counter(sentiments)
for sent, count in sent_counts.items():
pct = (count / len(financial_dataset)) * 100
print(f" {sent.capitalize()}: {count} ({pct:.1f}%)")
# Per-company analysis
print("\n🏢 Per-Company Analysis:")
companies = [ex['company'] for ex in financial_dataset]
company_counts = Counter(companies)
for company, count in company_counts.most_common():
avg_change = np.mean([ex['stock_change'] for ex in financial_dataset if ex['company'] == company])
print(f" {company}: {count} haber, ortalama değişim: {avg_change:+.2f}%")
# Financial preprocessing
print("\n🔧 Financial Text Preprocessing:")
def preprocess_financial_text(examples):
"""
    Custom preprocessing for financial text
"""
def extract_numbers(text):
        # Extract numbers and percentages
numbers = re.findall(r'\d+\.?\d*%?', text)
return numbers
def extract_financial_terms(text):
        # Count financial terms
terms = ['earnings', 'stock', 'market', 'quarterly', 'revenue',
'profit', 'loss', 'growth', 'decline']
count = sum([1 for term in terms if term in text.lower()])
return count
return {
'numbers_found': [extract_numbers(text) for text in examples['text']],
'financial_term_count': [extract_financial_terms(text) for text in examples['text']],
'text_length': [len(text) for text in examples['text']],
'has_percentage': ['%' in text for text in examples['text']]
}
financial_processed = financial_dataset.map(
preprocess_financial_text,
batched=True,
batch_size=500,
desc="Processing financial texts"
)
print(f"✅ {len(financial_processed)} finansal kayıt işlendi")
print(f"\nÖrnek analiz:")
print(f" Sayılar: {financial_processed[0]['numbers_found']}")
print(f" Finansal terim sayısı: {financial_processed[0]['financial_term_count']}")
print(f" Yüzde var mı: {financial_processed[0]['has_percentage']}")
print("\n" + "="*70)
print("4. TIBBİ/SAĞLIK DATASETS")
print("="*70)
# Synthetic medical data
def generate_medical_dataset(num_samples=1000):
"""
    Medical notes and diagnoses
"""
conditions = ['Diabetes', 'Hypertension', 'Asthma', 'Arthritis', 'Migraine']
treatments = ['Medication', 'Physical Therapy', 'Surgery', 'Lifestyle Changes']
severities = ['mild', 'moderate', 'severe']
    def gen():
        for i in range(num_samples):
            condition = np.random.choice(conditions)
            treatment = np.random.choice(treatments)
            severity = np.random.choice(severities)
            patient_id = f'P{i:05d}'
            visit_date = f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}'
            # Medical note (embeds the patient ID and visit date so the
            # PHI-removal step below actually has something to scrub)
            note = f"Patient {patient_id} seen on {visit_date} presents with {severity} {condition.lower()}. " \
                   f"Symptoms include relevant clinical findings. " \
                   f"Recommended treatment: {treatment}. " \
                   f"Follow-up scheduled. Patient advised on preventive measures."
yield {
'id': f'med_{i:06d}',
                'patient_id': patient_id,
'condition': condition,
'severity': severity,
'treatment': treatment,
'note': note,
'age': np.random.randint(18, 90),
'gender': np.random.choice(['M', 'F']),
                'visit_date': visit_date,
'diagnosis_confidence': np.random.uniform(0.7, 1.0),
'follow_up_required': np.random.choice([True, False])
}
return Dataset.from_generator(gen)
print("\n🏥 Tıbbi Dataset Oluşturuluyor...")
medical_dataset = generate_medical_dataset(2000)
print(f"✅ {len(medical_dataset)} tıbbi kayıt yüklendi")
print(f"\nÖrnek tıbbi kayıt:")
med_sample = medical_dataset[0]
print(f" ID: {med_sample['id']}")
print(f" Hasta ID: {med_sample['patient_id']}")
print(f" Durum: {med_sample['condition']}")
print(f" Şiddet: {med_sample['severity']}")
print(f" Tedavi: {med_sample['treatment']}")
print(f" Yaş: {med_sample['age']}")
print(f" Tanı güveni: {med_sample['diagnosis_confidence']:.2f}")
print(f" Not: {med_sample['note'][:100]}...")
# Condition distribution
print("\n📊 Medical Condition Distribution:")
conditions = [ex['condition'] for ex in medical_dataset]
cond_counts = Counter(conditions)
for cond, count in cond_counts.most_common():
pct = (count / len(medical_dataset)) * 100
print(f" {cond}: {count} ({pct:.1f}%)")
# Severity analysis
print("\n⚠️ Severity Distribution:")
severities = [ex['severity'] for ex in medical_dataset]
sev_counts = Counter(severities)
for sev, count in sorted(sev_counts.items()):
pct = (count / len(medical_dataset)) * 100
print(f" {sev.capitalize()}: {count} ({pct:.1f}%)")
# Age groups
print("\n👥 Age Group Analysis:")
ages = [ex['age'] for ex in medical_dataset]
age_groups = {
'18-30': sum([1 for age in ages if 18 <= age <= 30]),
'31-50': sum([1 for age in ages if 31 <= age <= 50]),
'51-70': sum([1 for age in ages if 51 <= age <= 70]),
'71+': sum([1 for age in ages if age > 70])
}
for group, count in age_groups.items():
pct = (count / len(ages)) * 100
print(f" {group}: {count} ({pct:.1f}%)")
# Medical preprocessing
print("\n🔧 Medical Text Preprocessing (PHI Removal):")
def preprocess_medical_text(examples):
"""
    Custom preprocessing for medical text:
    a simulated PHI (Protected Health Information) removal step
"""
def anonymize_text(text, patient_id):
        # Anonymize patient IDs
text = text.replace(patient_id, '[PATIENT_ID]')
        # Anonymize dates
text = re.sub(r'\d{4}-\d{2}-\d{2}', '[DATE]', text)
return text
def extract_medical_entities(text):
        # Count medical terms (simple example)
terms = ['patient', 'symptoms', 'treatment', 'diagnosis',
'medication', 'therapy', 'condition']
count = sum([1 for term in terms if term in text.lower()])
return count
return {
'note_anonymized': [
anonymize_text(note, pid)
for note, pid in zip(examples['note'], examples['patient_id'])
],
'medical_entity_count': [extract_medical_entities(note) for note in examples['note']],
'note_length': [len(note) for note in examples['note']],
'requires_follow_up': examples['follow_up_required']
}
medical_processed = medical_dataset.map(
preprocess_medical_text,
batched=True,
batch_size=500,
desc="Anonymizing medical records"
)
print(f"✅ {len(medical_processed)} tıbbi kayıt anonimleştirildi")
print(f"\nÖrnek anonimleştirilmiş not:")
print(f" Orijinal: {medical_processed[0]['note'][:100]}...")
print(f" Anonimleştirilmiş: {medical_processed[0]['note_anonymized'][:100]}...")
print(f" Tıbbi entity sayısı: {medical_processed[0]['medical_entity_count']}")
print("\n" + "="*70)
print("5. DOMAIN-SPECIFIC TOKENIZATION")
print("="*70)
print("\n🔤 Domain-Specific Tokenization Stratejileri:")
# Scientific text
print("\n1️⃣ Scientific Text Tokenization:")
scientific_sample = scientific_dataset[0]['abstract']
print(f" Original: {scientific_sample[:80]}...")
# Simple word tokenization
words = scientific_sample.split()
print(f" Word tokens: {len(words)} words")
print(f" First 5 tokens: {words[:5]}")
# Sentence tokenization
sentences = scientific_sample.split('.')
print(f" Sentence tokens: {len([s for s in sentences if s.strip()])} sentences")
# Code
print("\n2️⃣ Code Tokenization:")
code_sample = code_dataset[0]['code']
print(f" Code:\n{code_sample}")
# Line-based
lines = code_sample.split('\n')
print(f" Line count: {len(lines)}")
# Token-based (simple)
code_tokens = re.findall(r'\w+|[^\w\s]', code_sample)
print(f" Token count: {len(code_tokens)}")
print(f" First 10 tokens: {code_tokens[:10]}")
print("\n" + "="*70)
print("6. CROSS-DOMAIN DATASET BİRLEŞTİRME")
print("="*70)
print("\n🔄 Farklı Domain'lerden Dataset Birleştirme:")
# Her domain'den küçük subset al
sci_subset = scientific_dataset.select(range(100))
code_subset = code_dataset.select(range(100))
fin_subset = financial_dataset.select(range(100))
# Convert to a common format
def normalize_scientific(example):
return {
'text': example['abstract'],
'domain': 'scientific',
'metadata': {
'type': example['domain'],
'year': example['year']
}
}
def normalize_code(example):
return {
'text': example['code'],
'domain': 'code',
'metadata': {
'language': example['language'],
'lines': example['lines_of_code']
}
}
def normalize_financial(example):
return {
'text': example['text'],
'domain': 'financial',
'metadata': {
'sentiment': example['sentiment'],
'company': example['company']
}
}
print("\n📦 Dataset'leri normalize ediyoruz...")
sci_norm = sci_subset.map(normalize_scientific, remove_columns=sci_subset.column_names)
code_norm = code_subset.map(normalize_code, remove_columns=code_subset.column_names)
fin_norm = fin_subset.map(normalize_financial, remove_columns=fin_subset.column_names)
# Concatenate the normalized subsets
multi_domain = concatenate_datasets([sci_norm, code_norm, fin_norm])
print(f"✅ Multi-domain dataset: {len(multi_domain)} örnek")
print(f"\nDomain dağılımı:")
domains = [ex['domain'] for ex in multi_domain]
domain_dist = Counter(domains)
for domain, count in domain_dist.items():
print(f" {domain}: {count}")
print(f"\nÖrnek multi-domain kayıtlar:")
for i in range(3):
ex = multi_domain[i * 100] # Her domain'den birer örnek
print(f"\n {i+1}. Domain: {ex['domain']}")
print(f" Text: {ex['text'][:80]}...")
print(f" Metadata: {ex['metadata']}")
print("\n" + "="*70)
print("7. DOMAIN ADAPTATION TEKNİKLERİ")
print("="*70)
print("\n🎯 Domain Adaptation Stratejileri:")
# Örnek: Genel domain'den specific domain'e transfer
print("\n1️⃣ Domain-Specific Vocabulary Analysis:")
def analyze_domain_vocabulary(dataset, text_column, domain_name):
"""
    Domain-specific vocabulary analysis
"""
all_words = []
for example in dataset:
words = example[text_column].lower().split()
all_words.extend(words)
vocab_counts = Counter(all_words)
return {
'domain': domain_name,
'total_words': len(all_words),
'unique_words': len(vocab_counts),
'top_10_words': vocab_counts.most_common(10)
}
# Vocabulary analysis for each domain
sci_vocab = analyze_domain_vocabulary(
scientific_dataset.select(range(500)),
'abstract',
'Scientific'
)
code_vocab = analyze_domain_vocabulary(
code_dataset.select(range(500)),
'code',
'Code'
)
fin_vocab = analyze_domain_vocabulary(
financial_dataset.select(range(500)),
'text',
'Financial'
)
print("\n📚 Domain Vocabulary İstatistikleri:")
for vocab in [sci_vocab, code_vocab, fin_vocab]:
print(f"\n {vocab['domain']}:")
print(f" Toplam kelime: {vocab['total_words']:,}")
print(f" Benzersiz kelime: {vocab['unique_words']:,}")
print(f" Vocabulary zenginliği: {vocab['unique_words']/vocab['total_words']:.3f}")
print(f" Top 5 kelime: {[w for w, c in vocab['top_10_words'][:5]]}")
print("\n2️⃣ Domain-Specific Data Augmentation:")
def augment_scientific_text(example):
"""
    Data augmentation for scientific text
"""
text = example['abstract']
    # Synonym replacement (simple simulation)
augmented = text.replace('novel', 'innovative')
augmented = augmented.replace('propose', 'present')
augmented = augmented.replace('demonstrate', 'show')
return {
**example,
'abstract_augmented': augmented
}
print("\n Bilimsel metin augmentation örneği:")
aug_sample = augment_scientific_text(scientific_dataset[0])
print(f" Original: {aug_sample['abstract'][:100]}...")
print(f" Augmented: {aug_sample['abstract_augmented'][:100]}...")
print("\n3️⃣ Domain-Specific Filtering:")
def filter_high_quality_scientific(example):
"""
    Keep only high-quality scientific papers
"""
    return (
        example['citations'] > 50 and          # highly cited
        example['year'] >= 2020 and            # published recently
        # a 100-word cutoff would reject every synthetic abstract (~40 words each)
        len(example['abstract'].split()) > 30  # reasonably detailed abstract
    )
high_quality_sci = scientific_dataset.filter(
filter_high_quality_scientific,
desc="Filtering high-quality papers"
)
print(f"\n Kaliteli makale filtreleme:")
print(f" Orijinal: {len(scientific_dataset)} makale")
print(f" Filtrelenmiş: {len(high_quality_sci)} makale")
print(f" Oran: {len(high_quality_sci)/len(scientific_dataset)*100:.1f}%")
print("\n" + "="*70)
print("8. DOMAIN-SPECIFIC EVALUATION METRİKLERİ")
print("="*70)
print("\n📊 Domain-Specific Kalite Metrikleri:")
def calculate_domain_metrics(dataset, domain_name):
"""
    Compute domain-specific quality metrics
"""
if domain_name == 'scientific':
        # Scientific metrics
avg_citations = np.mean([ex['citations'] for ex in dataset])
avg_authors = np.mean([len(ex['authors']) for ex in dataset])
recent_papers = sum([1 for ex in dataset if ex['year'] >= 2020])
return {
'domain': domain_name,
'avg_citations': avg_citations,
'avg_authors': avg_authors,
'recent_ratio': recent_papers / len(dataset)
}
elif domain_name == 'code':
        # Code metrics
avg_loc = np.mean([ex['lines_of_code'] for ex in dataset])
has_doc = sum([1 for ex in dataset if ex['has_docstring']])
high_stars = sum([1 for ex in dataset if ex['stars'] > 1000])
return {
'domain': domain_name,
'avg_lines_of_code': avg_loc,
'documentation_ratio': has_doc / len(dataset),
'popular_ratio': high_stars / len(dataset)
}
elif domain_name == 'financial':
        # Financial metrics
sentiments = [ex['sentiment'] for ex in dataset]
sent_dist = Counter(sentiments)
avg_change = np.mean([ex['stock_change'] for ex in dataset])
return {
'domain': domain_name,
'sentiment_distribution': dict(sent_dist),
'avg_stock_change': avg_change,
'volatility': np.std([ex['stock_change'] for ex in dataset])
}
print("\n1️⃣ Scientific Metrics:")
sci_metrics = calculate_domain_metrics(scientific_dataset, 'scientific')
for key, value in sci_metrics.items():
print(f" {key}: {value}")
print("\n2️⃣ Code Metrics:")
code_metrics = calculate_domain_metrics(code_dataset, 'code')
for key, value in code_metrics.items():
print(f" {key}: {value}")
print("\n3️⃣ Financial Metrics:")
fin_metrics = calculate_domain_metrics(financial_dataset, 'financial')
for key, value in fin_metrics.items():
print(f" {key}: {value}")
print("\n" + "="*70)
print("9. BEST PRACTICES - DOMAIN-SPECIFIC DATASETS")
print("="*70)
print("""
✅ BİLİMSEL DATASETS:
- Citation metadata ekle
- Abstract + full text ayrımı
- Domain/field classification
- Author disambiguation
- Reference parsing
- LaTeX formül handling
✅ KOD DATASETS:
- Programlama dili ayrımı
- Syntax parsing
- Docstring extraction
- Repository metadata
- License bilgisi
- Code quality metrics (complexity, coverage)
✅ FİNANSAL DATASETS:
- Sentiment annotation
- Entity recognition (companies, people)
- Temporal information
- Numerical data extraction
- Market data integration
- Real-time updates
✅ TIBBİ DATASETS:
- PHI (Protected Health Information) removal
- HIPAA compliance
- Clinical terminology standardization
- ICD code mapping
- Anonymization
- Ethical considerations
✅ GENEL PRENSİPLER:
- Domain expertise gerekir
- Specialized tokenization
- Domain-specific validation
- Quality filtering
- Ethical guidelines takip et
- License ve copyright kontrol et
✅ DATA QUALITY:
- Domain experts ile validate et
- Inter-annotator agreement hesapla
- Bias analysis yap
- Coverage analysis
- Statistical validation
- Regular updates
""")
print("\n" + "="*70)
print("✅ BÖLÜM 2 TAMAMLANDI!")
print("="*70)
print(f"""
Bu bölümde öğrendikleriniz:
✓ Bilimsel makale datasets ({len(scientific_dataset)} örnek)
✓ Kod datasets ({len(code_dataset)} örnek)
✓ Finansal analiz datasets ({len(financial_dataset)} örnek)
✓ Tıbbi/sağlık datasets ({len(medical_dataset)} örnek)
✓ Domain-specific preprocessing
✓ Cross-domain dataset birleştirme
✓ Domain adaptation teknikleri
✓ Domain-specific evaluation metrikleri
📊 ÜRETİLEN DATASETS:
- Scientific: {len(scientific_dataset):,} makale
- Code: {len(code_dataset):,} kod örneği
- Financial: {len(financial_dataset):,} finansal kayıt
- Medical: {len(medical_dataset):,} tıbbi kayıt
- Multi-domain: {len(multi_domain):,} birleştirilmiş örnek
📚 SONRAKI BÖLÜM: İleri Teknikler
- Dataset streaming (büyük datasets için)
- Custom data collators
- Feature extraction ve transformation
- Dataset preprocessing pipelines
- Advanced filtering strategies
""")
print("\n🚀 Harika! İkinci bölümü tamamladık!")
print("Üçüncü bölüme (İleri Teknikler) geçelim mi?")