""" DOMAIN-SPECIFIC DATASETS - İLERİ SEVİYE HUGGING FACE ==================================================== Bu modülde öğrenecekleriniz: 1. Bilimsel Makaleler (arXiv, PubMed) - Academic datasets 2. Kod Datasets (The Stack, CodeParrot) - Programming datasets 3. Finansal Analiz Datasets - Finance & Business 4. Tıbbi/Sağlık Datasets - Medical & Healthcare 5. Domain-specific preprocessing 6. Custom tokenization 7. Domain adaptation techniques """ from datasets import Dataset, load_dataset, DatasetDict import numpy as np import json from typing import Dict, List import time from collections import Counter import re print("="*70) print("🔬 DOMAIN-SPECIFIC DATASETS - İLERİ SEVİYE") print("="*70) print("\n" + "="*70) print("1. BİLİMSEL MAKALELER - ACADEMIC DATASETS") print("="*70) # Sentetik bilimsel makale dataset'i def generate_scientific_papers(num_samples=1000): """ Bilimsel makale formatında sentetik veri """ domains = ['Physics', 'Computer Science', 'Biology', 'Mathematics', 'Chemistry'] def gen(): for i in range(num_samples): domain = np.random.choice(domains) # Makale yapısı abstract = f"This paper presents a novel approach to {domain.lower()} research. " \ f"We propose a methodology that addresses key challenges in the field. " \ f"Our experimental results show significant improvements over baseline methods. " \ f"The proposed framework demonstrates applicability across multiple scenarios." yield { 'id': f'arxiv.{i:06d}', 'title': f'Advanced Methods in {domain} Research: A Comprehensive Study {i}', 'abstract': abstract, 'authors': [f'Author {j}' for j in range(np.random.randint(2, 6))], 'domain': domain, 'year': np.random.randint(2015, 2025), 'citations': np.random.randint(0, 500), 'keywords': [f'keyword{j}' for j in range(np.random.randint(3, 8))], 'full_text': abstract + " " + abstract * np.random.randint(5, 15) } return Dataset.from_generator(gen) print("\n📚 Bilimsel Makale Dataset'i Oluşturuluyor...") scientific_dataset = generate_scientific_papers(2000) print(f"✅ {len(scientific_dataset)} bilimsel makale yüklendi") print(f"\nÖrnek makale:") sample = scientific_dataset[0] print(f" ID: {sample['id']}") print(f" Başlık: {sample['title']}") print(f" Domain: {sample['domain']}") print(f" Yazar sayısı: {len(sample['authors'])}") print(f" Yıl: {sample['year']}") print(f" Atıf sayısı: {sample['citations']}") print(f" Abstract: {sample['abstract'][:150]}...") # Domain bazlı istatistikler print("\n📊 Domain Dağılımı:") domains = [ex['domain'] for ex in scientific_dataset] domain_counts = Counter(domains) for domain, count in domain_counts.most_common(): pct = (count / len(scientific_dataset)) * 100 print(f" {domain}: {count} ({pct:.1f}%)") # Yıllara göre analiz print("\n📅 Yıllara Göre Yayın Sayısı:") years = [ex['year'] for ex in scientific_dataset] year_counts = Counter(years) for year in sorted(year_counts.keys())[-5:]: print(f" {year}: {year_counts[year]} makale") # Atıf analizi citations = [ex['citations'] for ex in scientific_dataset] print(f"\n📈 Atıf İstatistikleri:") print(f" Ortalama: {np.mean(citations):.1f}") print(f" Median: {np.median(citations):.1f}") print(f" En çok atıf: {np.max(citations)}") # Preprocessing - Bilimsel text temizleme print("\n🔧 Bilimsel Text Preprocessing:") def preprocess_scientific_text(examples): """ Bilimsel metin için özel preprocessing """ processed = [] for text in examples['abstract']: # Küçük harfe çevir text = text.lower() # Özel karakterleri temizle text = re.sub(r'[^\w\s\.]', '', text) # Fazla boşlukları temizle text = ' '.join(text.split()) 
print("\n" + "="*70)
print("2. CODE DATASETS - PROGRAMMING & SOFTWARE")
print("="*70)

# Synthetic code dataset
def generate_code_dataset(num_samples=1000):
    """Code samples across several programming languages."""
    languages = ['Python', 'JavaScript', 'Java', 'C++', 'Go', 'Rust']

    code_templates = {
        'Python': '''def {func_name}({params}):
    """
    {docstring}
    """
    result = {body}
    return result''',
        'JavaScript': '''function {func_name}({params}) {{
    // {docstring}
    const result = {body};
    return result;
}}''',
        'Java': '''public {return_type} {func_name}({params}) {{
    // {docstring}
    {return_type} result = {body};
    return result;
}}''',
    }

    def gen():
        for i in range(num_samples):
            lang = np.random.choice(languages)

            # Code attributes
            func_name = f"process_data_{i}"
            params = "data, config"
            docstring = f"Process data using method {i}"
            body = "data * 2 + config"

            if lang in code_templates:
                code = code_templates[lang].format(
                    func_name=func_name,
                    params=params,
                    docstring=docstring,
                    body=body,
                    return_type='int' if lang == 'Java' else ''
                )
            else:
                code = f"// {lang} code example\n{func_name}({params})"

            yield {
                'id': f'code_{i:06d}',
                'language': lang,
                'code': code,
                'func_name': func_name,
                'lines_of_code': len(code.split('\n')),
                # Check for an actual triple-quoted docstring; only the Python
                # template produces one. (The previous check for the literal
                # substring 'docstring' was always False.)
                'has_docstring': '"""' in code,
                'complexity': np.random.choice(['low', 'medium', 'high']),
                'repo': f'github.com/user/repo_{i % 100}',
                'stars': np.random.randint(0, 10000)
            }

    return Dataset.from_generator(gen)

print("\n💻 Building the code dataset...")
code_dataset = generate_code_dataset(2000)
print(f"✅ {len(code_dataset)} code samples loaded")

print(f"\nSample code:")
code_sample = code_dataset[0]
print(f"  ID: {code_sample['id']}")
print(f"  Language: {code_sample['language']}")
print(f"  Lines of code: {code_sample['lines_of_code']}")
print(f"  Complexity: {code_sample['complexity']}")
print(f"\n  Code:\n{code_sample['code']}\n")

# Language distribution
print("\n📊 Programming Language Distribution:")
languages = [ex['language'] for ex in code_dataset]
lang_counts = Counter(languages)
for lang, count in lang_counts.most_common():
    pct = (count / len(code_dataset)) * 100
    print(f"  {lang}: {count} ({pct:.1f}%)")

# Code analysis
print("\n📈 Code Metrics:")
loc_values = [ex['lines_of_code'] for ex in code_dataset]
print(f"  Mean lines of code: {np.mean(loc_values):.1f}")
print(f"  Median lines of code: {np.median(loc_values):.1f}")

has_docstring = sum([1 for ex in code_dataset if ex['has_docstring']])
print(f"  Docstring ratio: {(has_docstring/len(code_dataset)*100):.1f}%")

# Code preprocessing
print("\n🔧 Code Preprocessing:")

def preprocess_code(examples):
    """Preprocessing tailored to source code."""
    def extract_functions(code):
        # Extract function names (simple regex)
        funcs = re.findall(r'def\s+(\w+)|function\s+(\w+)|public\s+\w+\s+(\w+)', code)
        return [f for group in funcs for f in group if f]

    def count_comments(code):
        # Count comment markers
        return len(re.findall(r'#|//|/\*|\*/', code))

    return {
        'functions': [extract_functions(code) for code in examples['code']],
        'comment_count': [count_comments(code) for code in examples['code']],
        'code_chars': [len(code) for code in examples['code']],
        'code_tokens': [len(code.split()) for code in examples['code']]
    }

code_processed = code_dataset.map(
    preprocess_code,
    batched=True,
    batch_size=500,
    desc="Analyzing code"
)

print(f"✅ {len(code_processed)} code samples analyzed")
print(f"\nSample analysis:")
print(f"  Functions: {code_processed[0]['functions']}")
print(f"  Comment count: {code_processed[0]['comment_count']}")
print(f"  Token count: {code_processed[0]['code_tokens']}")
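
# A more robust alternative to the regex above, for Python sources only: parse
# the snippet with the standard-library `ast` module and walk the tree. A
# minimal sketch; non-Python samples simply fail to parse and return [].
import ast

def extract_python_functions(code):
    """Return the names of all function definitions in a Python snippet."""
    try:
        tree = ast.parse(code)
    except SyntaxError:
        return []  # not valid Python (e.g. a JavaScript or Java sample)
    return [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]

python_sample = next(ex for ex in code_dataset if ex['language'] == 'Python')
print(f"\nAST-extracted functions: {extract_python_functions(python_sample['code'])}")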
print("\n" + "="*70)
print("3. FINANCIAL ANALYSIS DATASETS")
print("="*70)

# Synthetic financial data
def generate_financial_dataset(num_samples=1000):
    """Financial news and analysis dataset."""
    companies = ['TechCorp', 'FinanceBank', 'RetailCo', 'EnergyInc', 'HealthMed']
    sentiments = ['positive', 'negative', 'neutral']
    categories = ['earnings', 'merger', 'product_launch', 'scandal', 'expansion']

    def gen():
        for i in range(num_samples):
            company = np.random.choice(companies)
            sentiment = np.random.choice(sentiments)
            category = np.random.choice(categories)

            # Financial news text
            if sentiment == 'positive':
                text = f"{company} announces strong quarterly earnings, exceeding market expectations. " \
                       f"Stock prices surged following the announcement. Analysts remain optimistic."
            elif sentiment == 'negative':
                text = f"{company} faces challenges in the current market. " \
                       f"Quarterly results fell short of expectations. Investors express concern."
            else:
                text = f"{company} maintains steady performance in Q{i%4+1}. " \
                       f"Market reaction remains moderate. Company outlook unchanged."

            yield {
                'id': f'fin_{i:06d}',
                'company': company,
                'text': text,
                'sentiment': sentiment,
                'category': category,
                'date': f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}',
                'stock_change': np.random.uniform(-10, 10),
                'volume': np.random.randint(1000000, 10000000),
                'market_cap': np.random.uniform(1e9, 100e9),
                'sector': np.random.choice(['Tech', 'Finance', 'Retail', 'Energy', 'Healthcare'])
            }

    return Dataset.from_generator(gen)

print("\n💰 Building the financial dataset...")
financial_dataset = generate_financial_dataset(2000)
print(f"✅ {len(financial_dataset)} financial records loaded")

print(f"\nSample financial record:")
fin_sample = financial_dataset[0]
print(f"  ID: {fin_sample['id']}")
print(f"  Company: {fin_sample['company']}")
print(f"  Sentiment: {fin_sample['sentiment']}")
print(f"  Category: {fin_sample['category']}")
print(f"  Stock change: {fin_sample['stock_change']:.2f}%")
print(f"  Text: {fin_sample['text'][:120]}...")

# Sentiment analysis
print("\n📊 Sentiment Distribution:")
sentiments = [ex['sentiment'] for ex in financial_dataset]
sent_counts = Counter(sentiments)
for sent, count in sent_counts.items():
    pct = (count / len(financial_dataset)) * 100
    print(f"  {sent.capitalize()}: {count} ({pct:.1f}%)")

# Per-company analysis
print("\n🏢 Per-Company Analysis:")
companies = [ex['company'] for ex in financial_dataset]
company_counts = Counter(companies)
for company, count in company_counts.most_common():
    avg_change = np.mean([ex['stock_change'] for ex in financial_dataset
                          if ex['company'] == company])
    print(f"  {company}: {count} articles, average change: {avg_change:+.2f}%")

# Financial preprocessing
print("\n🔧 Financial Text Preprocessing:")

def preprocess_financial_text(examples):
    """Preprocessing tailored to financial text."""
    def extract_numbers(text):
        # Extract numbers and percentages
        return re.findall(r'\d+\.?\d*%?', text)

    def extract_financial_terms(text):
        # Count financial terms
        terms = ['earnings', 'stock', 'market', 'quarterly', 'revenue',
                 'profit', 'loss', 'growth', 'decline']
        return sum([1 for term in terms if term in text.lower()])

    return {
        'numbers_found': [extract_numbers(text) for text in examples['text']],
        'financial_term_count': [extract_financial_terms(text) for text in examples['text']],
        'text_length': [len(text) for text in examples['text']],
        'has_percentage': ['%' in text for text in examples['text']]
    }

financial_processed = financial_dataset.map(
    preprocess_financial_text,
    batched=True,
    batch_size=500,
    desc="Processing financial texts"
)

print(f"✅ {len(financial_processed)} financial records processed")
print(f"\nSample analysis:")
print(f"  Numbers: {financial_processed[0]['numbers_found']}")
print(f"  Financial term count: {financial_processed[0]['financial_term_count']}")
print(f"  Contains percentage: {financial_processed[0]['has_percentage']}")
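
# The per-company loop above rescans the dataset once per company. A minimal
# sketch of the same aggregation done in one pass via pandas, assuming pandas
# is available (it is a dependency of `datasets`, which provides to_pandas()).
fin_df = financial_dataset.to_pandas()
print("\nAverage stock change by sentiment (pandas):")
print(fin_df.groupby('sentiment')['stock_change'].mean().round(2))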
print("\n" + "="*70)
print("4. MEDICAL/HEALTHCARE DATASETS")
print("="*70)

# Synthetic medical data
def generate_medical_dataset(num_samples=1000):
    """Medical notes and diagnoses."""
    conditions = ['Diabetes', 'Hypertension', 'Asthma', 'Arthritis', 'Migraine']
    treatments = ['Medication', 'Physical Therapy', 'Surgery', 'Lifestyle Changes']
    severities = ['mild', 'moderate', 'severe']

    def gen():
        for i in range(num_samples):
            condition = np.random.choice(conditions)
            treatment = np.random.choice(treatments)
            severity = np.random.choice(severities)

            # Medical note
            note = f"Patient presents with {severity} {condition.lower()}. " \
                   f"Symptoms include relevant clinical findings. " \
                   f"Recommended treatment: {treatment}. " \
                   f"Follow-up scheduled. Patient advised on preventive measures."

            yield {
                'id': f'med_{i:06d}',
                'patient_id': f'P{i:05d}',
                'condition': condition,
                'severity': severity,
                'treatment': treatment,
                'note': note,
                'age': np.random.randint(18, 90),
                'gender': np.random.choice(['M', 'F']),
                'visit_date': f'2024-{(i%12)+1:02d}-{(i%28)+1:02d}',
                'diagnosis_confidence': np.random.uniform(0.7, 1.0),
                'follow_up_required': np.random.choice([True, False])
            }

    return Dataset.from_generator(gen)

print("\n🏥 Building the medical dataset...")
medical_dataset = generate_medical_dataset(2000)
print(f"✅ {len(medical_dataset)} medical records loaded")

print(f"\nSample medical record:")
med_sample = medical_dataset[0]
print(f"  ID: {med_sample['id']}")
print(f"  Patient ID: {med_sample['patient_id']}")
print(f"  Condition: {med_sample['condition']}")
print(f"  Severity: {med_sample['severity']}")
print(f"  Treatment: {med_sample['treatment']}")
print(f"  Age: {med_sample['age']}")
print(f"  Diagnosis confidence: {med_sample['diagnosis_confidence']:.2f}")
print(f"  Note: {med_sample['note'][:100]}...")

# Condition distribution
print("\n📊 Medical Condition Distribution:")
conditions = [ex['condition'] for ex in medical_dataset]
cond_counts = Counter(conditions)
for cond, count in cond_counts.most_common():
    pct = (count / len(medical_dataset)) * 100
    print(f"  {cond}: {count} ({pct:.1f}%)")

# Severity analysis
print("\n⚠️ Severity Distribution:")
severities = [ex['severity'] for ex in medical_dataset]
sev_counts = Counter(severities)
for sev, count in sorted(sev_counts.items()):
    pct = (count / len(medical_dataset)) * 100
    print(f"  {sev.capitalize()}: {count} ({pct:.1f}%)")

# Age groups
print("\n👥 Age Group Analysis:")
ages = [ex['age'] for ex in medical_dataset]
age_groups = {
    '18-30': sum([1 for age in ages if 18 <= age <= 30]),
    '31-50': sum([1 for age in ages if 31 <= age <= 50]),
    '51-70': sum([1 for age in ages if 51 <= age <= 70]),
    '71+': sum([1 for age in ages if age > 70])
}
for group, count in age_groups.items():
    pct = (count / len(ages)) * 100
    print(f"  {group}: {count} ({pct:.1f}%)")
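
# The same bucketing in one vectorized call - a minimal sketch using numpy's
# histogram with bin edges chosen to match the groups above (ages run 18-89).
hist_counts, _ = np.histogram(ages, bins=[18, 31, 51, 71, 90])
print(f"\nnp.histogram counts (18-30, 31-50, 51-70, 71+): {hist_counts.tolist()}")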
# Medical preprocessing
print("\n🔧 Medical Text Preprocessing (PHI Removal):")

def preprocess_medical_text(examples):
    """
    Preprocessing tailored to medical text.
    Simulates PHI (Protected Health Information) removal.
    """
    def anonymize_text(text, patient_id):
        # Anonymize patient IDs
        text = text.replace(patient_id, '[PATIENT_ID]')
        # Anonymize dates
        text = re.sub(r'\d{4}-\d{2}-\d{2}', '[DATE]', text)
        return text

    def extract_medical_entities(text):
        # Count medical terms (simple example)
        terms = ['patient', 'symptoms', 'treatment', 'diagnosis',
                 'medication', 'therapy', 'condition']
        return sum([1 for term in terms if term in text.lower()])

    return {
        'note_anonymized': [
            anonymize_text(note, pid)
            for note, pid in zip(examples['note'], examples['patient_id'])
        ],
        'medical_entity_count': [extract_medical_entities(note) for note in examples['note']],
        'note_length': [len(note) for note in examples['note']],
        'requires_follow_up': examples['follow_up_required']
    }

medical_processed = medical_dataset.map(
    preprocess_medical_text,
    batched=True,
    batch_size=500,
    desc="Anonymizing medical records"
)

print(f"✅ {len(medical_processed)} medical records anonymized")
print(f"\nSample anonymized note:")
print(f"  Original: {medical_processed[0]['note'][:100]}...")
print(f"  Anonymized: {medical_processed[0]['note_anonymized'][:100]}...")
print(f"  Medical entity count: {medical_processed[0]['medical_entity_count']}")

print("\n" + "="*70)
print("5. DOMAIN-SPECIFIC TOKENIZATION")
print("="*70)

print("\n🔤 Domain-Specific Tokenization Strategies:")

# Scientific text
print("\n1️⃣ Scientific Text Tokenization:")
scientific_sample = scientific_dataset[0]['abstract']
print(f"  Original: {scientific_sample[:80]}...")

# Simple word tokenization
words = scientific_sample.split()
print(f"  Word tokens: {len(words)} words")
print(f"  First 5 tokens: {words[:5]}")

# Sentence tokenization
sentences = scientific_sample.split('.')
print(f"  Sentence tokens: {len([s for s in sentences if s.strip()])} sentences")

# Code
print("\n2️⃣ Code Tokenization:")
code_sample = code_dataset[0]['code']
print(f"  Code:\n{code_sample}")

# Line-based
lines = code_sample.split('\n')
print(f"  Line count: {len(lines)}")

# Token-based (simple)
code_tokens = re.findall(r'\w+|[^\w\s]', code_sample)
print(f"  Token count: {len(code_tokens)}")
print(f"  First 10 tokens: {code_tokens[:10]}")
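
# How differently a generic subword vocabulary treats each domain - a minimal
# sketch, assuming `transformers` is installed; 'gpt2' is a real checkpoint
# whose byte-level BPE was trained mostly on general web text.
try:
    from transformers import AutoTokenizer

    gpt2_tok = AutoTokenizer.from_pretrained('gpt2')
    for label, text in [('scientific', scientific_sample), ('code', code_sample)]:
        n_sub = len(gpt2_tok.tokenize(text))
        n_ws = len(text.split())
        # More subwords per whitespace token usually means the vocabulary fits
        # the domain worse - one argument for domain-specific tokenizers.
        print(f"  {label}: {n_ws} whitespace tokens -> {n_sub} BPE subwords "
              f"({n_sub / max(n_ws, 1):.2f}x)")
except ImportError:
    print("  (transformers not installed - skipping the BPE comparison)")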
print("\n" + "="*70)
print("6. CROSS-DOMAIN DATASET MERGING")
print("="*70)

print("\n🔄 Merging Datasets from Different Domains:")

# Take a small subset from each domain
sci_subset = scientific_dataset.select(range(100))
code_subset = code_dataset.select(range(100))
fin_subset = financial_dataset.select(range(100))

# Convert to a shared format
def normalize_scientific(example):
    return {
        'text': example['abstract'],
        'domain': 'scientific',
        'metadata': {
            'type': example['domain'],
            'year': example['year']
        }
    }

def normalize_code(example):
    return {
        'text': example['code'],
        'domain': 'code',
        'metadata': {
            'language': example['language'],
            'lines': example['lines_of_code']
        }
    }

def normalize_financial(example):
    return {
        'text': example['text'],
        'domain': 'financial',
        'metadata': {
            'sentiment': example['sentiment'],
            'company': example['company']
        }
    }

print("\n📦 Normalizing the datasets...")
sci_norm = sci_subset.map(normalize_scientific, remove_columns=sci_subset.column_names)
code_norm = code_subset.map(normalize_code, remove_columns=code_subset.column_names)
fin_norm = fin_subset.map(normalize_financial, remove_columns=fin_subset.column_names)

# Concatenate
from datasets import concatenate_datasets

multi_domain = concatenate_datasets([sci_norm, code_norm, fin_norm])
print(f"✅ Multi-domain dataset: {len(multi_domain)} examples")

print(f"\nDomain distribution:")
domains = [ex['domain'] for ex in multi_domain]
domain_dist = Counter(domains)
for domain, count in domain_dist.items():
    print(f"  {domain}: {count}")

print(f"\nSample multi-domain records:")
for i in range(3):
    ex = multi_domain[i * 100]  # One example from each domain
    print(f"\n  {i+1}. Domain: {ex['domain']}")
    print(f"     Text: {ex['text'][:80]}...")
    print(f"     Metadata: {ex['metadata']}")

print("\n" + "="*70)
print("7. DOMAIN ADAPTATION TECHNIQUES")
print("="*70)

print("\n🎯 Domain Adaptation Strategies:")

# Example: transfer from a general domain to a specific one
print("\n1️⃣ Domain-Specific Vocabulary Analysis:")

def analyze_domain_vocabulary(dataset, text_column, domain_name):
    """Analyze the domain-specific vocabulary."""
    all_words = []
    for example in dataset:
        words = example[text_column].lower().split()
        all_words.extend(words)

    vocab_counts = Counter(all_words)

    return {
        'domain': domain_name,
        'total_words': len(all_words),
        'unique_words': len(vocab_counts),
        'top_10_words': vocab_counts.most_common(10)
    }

# Vocabulary analysis per domain
sci_vocab = analyze_domain_vocabulary(
    scientific_dataset.select(range(500)), 'abstract', 'Scientific'
)
code_vocab = analyze_domain_vocabulary(
    code_dataset.select(range(500)), 'code', 'Code'
)
fin_vocab = analyze_domain_vocabulary(
    financial_dataset.select(range(500)), 'text', 'Financial'
)

print("\n📚 Domain Vocabulary Statistics:")
for vocab in [sci_vocab, code_vocab, fin_vocab]:
    print(f"\n  {vocab['domain']}:")
    print(f"    Total words: {vocab['total_words']:,}")
    print(f"    Unique words: {vocab['unique_words']:,}")
    print(f"    Vocabulary richness: {vocab['unique_words']/vocab['total_words']:.3f}")
    print(f"    Top 5 words: {[w for w, c in vocab['top_10_words'][:5]]}")
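
# How much the domains' vocabularies actually overlap - a minimal sketch using
# Jaccard similarity over word sets. Low overlap is one signal that a shared
# tokenizer or model will need domain adaptation.
def vocab_set(dataset, text_column):
    """Collect the set of lowercase whitespace tokens in one text column."""
    words = set()
    for example in dataset:
        words.update(example[text_column].lower().split())
    return words

sci_words = vocab_set(scientific_dataset.select(range(500)), 'abstract')
fin_words = vocab_set(financial_dataset.select(range(500)), 'text')
jaccard = len(sci_words & fin_words) / len(sci_words | fin_words)
print(f"\n  Scientific vs Financial vocabulary Jaccard overlap: {jaccard:.3f}")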
print("\n2️⃣ Domain-Specific Data Augmentation:")

def augment_scientific_text(example):
    """Data augmentation for scientific text."""
    text = example['abstract']

    # Synonym replacement (simple simulation)
    augmented = text.replace('novel', 'innovative')
    augmented = augmented.replace('propose', 'present')
    augmented = augmented.replace('demonstrate', 'show')

    return {
        **example,
        'abstract_augmented': augmented
    }

print("\n  Scientific text augmentation example:")
aug_sample = augment_scientific_text(scientific_dataset[0])
print(f"  Original: {aug_sample['abstract'][:100]}...")
print(f"  Augmented: {aug_sample['abstract_augmented'][:100]}...")

print("\n3️⃣ Domain-Specific Filtering:")

def filter_high_quality_scientific(example):
    """Filter for high-quality scientific papers."""
    return (
        example['citations'] > 50 and            # highly cited
        example['year'] >= 2020 and              # published in recent years
        len(example['abstract'].split()) > 30    # reasonably detailed abstract
        # (the synthetic abstracts run ~40 words, so the original 100-word
        # threshold would have filtered out every paper)
    )

high_quality_sci = scientific_dataset.filter(
    filter_high_quality_scientific,
    desc="Filtering high-quality papers"
)

print(f"\n  High-quality paper filtering:")
print(f"  Original: {len(scientific_dataset)} papers")
print(f"  Filtered: {len(high_quality_sci)} papers")
print(f"  Ratio: {len(high_quality_sci)/len(scientific_dataset)*100:.1f}%")

print("\n" + "="*70)
print("8. DOMAIN-SPECIFIC EVALUATION METRICS")
print("="*70)

print("\n📊 Domain-Specific Quality Metrics:")

def calculate_domain_metrics(dataset, domain_name):
    """Compute domain-specific quality metrics."""
    if domain_name == 'scientific':
        # Scientific metrics
        avg_citations = np.mean([ex['citations'] for ex in dataset])
        avg_authors = np.mean([len(ex['authors']) for ex in dataset])
        recent_papers = sum([1 for ex in dataset if ex['year'] >= 2020])

        return {
            'domain': domain_name,
            'avg_citations': avg_citations,
            'avg_authors': avg_authors,
            'recent_ratio': recent_papers / len(dataset)
        }

    elif domain_name == 'code':
        # Code metrics
        avg_loc = np.mean([ex['lines_of_code'] for ex in dataset])
        has_doc = sum([1 for ex in dataset if ex['has_docstring']])
        high_stars = sum([1 for ex in dataset if ex['stars'] > 1000])

        return {
            'domain': domain_name,
            'avg_lines_of_code': avg_loc,
            'documentation_ratio': has_doc / len(dataset),
            'popular_ratio': high_stars / len(dataset)
        }

    elif domain_name == 'financial':
        # Financial metrics
        sentiments = [ex['sentiment'] for ex in dataset]
        sent_dist = Counter(sentiments)
        avg_change = np.mean([ex['stock_change'] for ex in dataset])

        return {
            'domain': domain_name,
            'sentiment_distribution': dict(sent_dist),
            'avg_stock_change': avg_change,
            'volatility': np.std([ex['stock_change'] for ex in dataset])
        }

print("\n1️⃣ Scientific Metrics:")
sci_metrics = calculate_domain_metrics(scientific_dataset, 'scientific')
for key, value in sci_metrics.items():
    print(f"  {key}: {value}")

print("\n2️⃣ Code Metrics:")
code_metrics = calculate_domain_metrics(code_dataset, 'code')
for key, value in code_metrics.items():
    print(f"  {key}: {value}")

print("\n3️⃣ Financial Metrics:")
fin_metrics = calculate_domain_metrics(financial_dataset, 'financial')
for key, value in fin_metrics.items():
    print(f"  {key}: {value}")
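
# calculate_domain_metrics covers three domains but not the medical dataset.
# A minimal sketch of an analogous branch, using only fields that exist in
# the records generated earlier.
def calculate_medical_metrics(dataset):
    """Quality metrics for the synthetic medical records."""
    avg_confidence = np.mean([ex['diagnosis_confidence'] for ex in dataset])
    follow_ups = sum([1 for ex in dataset if ex['follow_up_required']])
    severe_cases = sum([1 for ex in dataset if ex['severity'] == 'severe'])

    return {
        'domain': 'medical',
        'avg_diagnosis_confidence': avg_confidence,
        'follow_up_ratio': follow_ups / len(dataset),
        'severe_ratio': severe_cases / len(dataset)
    }

print("\n4️⃣ Medical Metrics (sketch):")
for key, value in calculate_medical_metrics(medical_dataset).items():
    print(f"  {key}: {value}")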
print("\n" + "="*70)
print("9. BEST PRACTICES - DOMAIN-SPECIFIC DATASETS")
print("="*70)

print("""
✅ SCIENTIFIC DATASETS:
   - Add citation metadata
   - Separate abstract and full text
   - Domain/field classification
   - Author disambiguation
   - Reference parsing
   - LaTeX formula handling

✅ CODE DATASETS:
   - Separate by programming language
   - Syntax parsing
   - Docstring extraction
   - Repository metadata
   - License information
   - Code quality metrics (complexity, coverage)

✅ FINANCIAL DATASETS:
   - Sentiment annotation
   - Entity recognition (companies, people)
   - Temporal information
   - Numerical data extraction
   - Market data integration
   - Real-time updates

✅ MEDICAL DATASETS:
   - PHI (Protected Health Information) removal
   - HIPAA compliance
   - Clinical terminology standardization
   - ICD code mapping
   - Anonymization
   - Ethical considerations

✅ GENERAL PRINCIPLES:
   - Domain expertise is required
   - Specialized tokenization
   - Domain-specific validation
   - Quality filtering
   - Follow ethical guidelines
   - Check license and copyright

✅ DATA QUALITY:
   - Validate with domain experts
   - Compute inter-annotator agreement
   - Run bias analysis
   - Coverage analysis
   - Statistical validation
   - Regular updates
""")

print("\n" + "="*70)
print("✅ PART 2 COMPLETE!")
print("="*70)

print(f"""
What you learned in this part:
✓ Scientific paper datasets ({len(scientific_dataset)} examples)
✓ Code datasets ({len(code_dataset)} examples)
✓ Financial analysis datasets ({len(financial_dataset)} examples)
✓ Medical/healthcare datasets ({len(medical_dataset)} examples)
✓ Domain-specific preprocessing
✓ Cross-domain dataset merging
✓ Domain adaptation techniques
✓ Domain-specific evaluation metrics

📊 DATASETS PRODUCED:
   - Scientific: {len(scientific_dataset):,} papers
   - Code: {len(code_dataset):,} code samples
   - Financial: {len(financial_dataset):,} financial records
   - Medical: {len(medical_dataset):,} medical records
   - Multi-domain: {len(multi_domain):,} merged examples

📚 NEXT PART: Advanced Techniques
   - Dataset streaming (for large datasets)
   - Custom data collators
   - Feature extraction and transformation
   - Dataset preprocessing pipelines
   - Advanced filtering strategies
""")

print("\n🚀 Great! Part two is complete!")
print("Shall we move on to part three (Advanced Techniques)?")