"""Research/Science Domain Plugin
Scores research competency based on:
- Publication record (papers, citations)
- Lab experience and duration
- Research project depth
- Thesis/dissertation summaries
"""
import re
import time
import logging
from typing import Dict, List
from .base_plugin import BaseDomainPlugin, DomainScore
from .plugin_factory import register_plugin
logger = logging.getLogger(__name__)
@register_plugin('research')
class ResearchPlugin(BaseDomainPlugin):
    """Research/Science domain scoring plugin.

    Scores research competency from publication record, lab experience,
    research-project depth, and thesis/dissertation summaries.
    """

    def __init__(self):
        super().__init__()
        # Keyword lists matched as substrings against lower-cased evidence
        # text by the analyzer methods below.
        self.publication_venues = [
            'journal', 'conference', 'proceedings', 'ieee', 'acm',
            'springer', 'elsevier', 'nature', 'science', 'arxiv',
        ]
        self.research_methods = [
            'experiment', 'methodology', 'hypothesis', 'literature review',
            'data collection', 'statistical analysis', 'simulation', 'survey',
        ]
def _get_domain_type(self) -> str:
return 'research'
def _get_feature_weights(self) -> Dict[str, float]:
return {
'publication_score': 0.35,
'lab_experience_score': 0.25,
'research_depth_score': 0.25,
'thesis_quality_score': 0.15
}
def get_required_fields(self) -> List[str]:
return ['research_description']
def get_optional_fields(self) -> List[str]:
return ['publications_text', 'lab_experience_text', 'thesis_summary']
def score(self, evidence_data: Dict) -> DomainScore:
"""Calculate research domain score"""
start_time = time.time()
features = {}
# Publication analysis
publications = evidence_data.get('publications_text', '')
features['publication_score'] = self._analyze_publications(publications)
# Lab experience
lab_exp = evidence_data.get('lab_experience_text', '')
features['lab_experience_score'] = self._analyze_lab_experience(lab_exp)
# Research depth from main description
research_desc = evidence_data.get('research_description', '')
features['research_depth_score'] = self._analyze_research_depth(research_desc)
# Thesis quality
thesis = evidence_data.get('thesis_summary', '')
features['thesis_quality_score'] = self._analyze_thesis(thesis)
# Calculate weighted score
score = sum(features[k] * self.feature_weights[k] for k in features.keys())
# Calculate confidence
confidence = self.calculate_confidence(evidence_data)
processing_time = (time.time() - start_time) * 1000
return DomainScore(
domain_type='research',
score=min(score, 1.0),
confidence=confidence,
raw_features=features,
processing_time_ms=processing_time
)
def _analyze_publications(self, publications_text: str) -> float:
"""
Analyze publication record
Returns: 0-1 score based on number and quality of publications
"""
if not publications_text or len(publications_text) < 30:
return 0.0
text_lower = publications_text.lower()
score = 0.0
# Count publication mentions (by common patterns)
# Pattern: "Paper title" or [1] Reference format
title_patterns = [
r'"([^"]+)"', # Quoted titles
r'\[\d+\]', # Numbered references
r'\d{4}\.\s', # Year format (2023. Title...)
]
pub_count = 0
for pattern in title_patterns:
matches = re.findall(pattern, publications_text)
pub_count = max(pub_count, len(matches))
# Score based on publication count
count_score = min(pub_count / 5, 0.6) # 5+ pubs = 0.6
score += count_score
# Venue quality bonus
venue_count = sum(1 for venue in self.publication_venues if venue in text_lower)
venue_score = min(venue_count / 3, 0.4) # 3+ venues = 0.4
score += venue_score
logger.info(f"Publication score: {score:.2f} ({pub_count} pubs, {venue_count} venues)")
return min(score, 1.0)
def _analyze_lab_experience(self, lab_text: str) -> float:
"""
Analyze laboratory experience
Returns: 0-1 score based on duration and depth
"""
if not lab_text or len(lab_text) < 30:
return 0.0
text_lower = lab_text.lower()
score = 0.0
# Extract duration (months/years)
duration_patterns = [
(r'(\d+)\s*years?', 12), # Convert years to months
(r'(\d+)\s*months?', 1),
]
max_duration = 0
for pattern, multiplier in duration_patterns:
matches = re.findall(pattern, text_lower)
if matches:
duration = max([int(m) * multiplier for m in matches])
max_duration = max(max_duration, duration)
# Duration score (12 months = max)
duration_score = min(max_duration / 12, 0.5)
score += duration_score
# Lab quality indicators
quality_keywords = ['research lab', 'professor', 'phd', 'equipment', 'experiment', 'protocol']
quality_count = sum(1 for kw in quality_keywords if kw in text_lower)
quality_score = min(quality_count / 4, 0.5)
score += quality_score
logger.info(f"Lab experience: {score:.2f} ({max_duration} months)")
return min(score, 1.0)
def _analyze_research_depth(self, research_desc: str) -> float:
"""
Analyze research methodology depth
Returns: 0-1 score based on methodology sophistication
"""
if not research_desc or len(research_desc) < 50:
return 0.0
text_lower = research_desc.lower()
score = 0.0
# Research method mentions
method_count = sum(1 for method in self.research_methods if method in text_lower)
method_score = min(method_count / 4, 0.5)
score += method_score
# Technical depth indicators
technical_terms = [
'algorithm', 'model', 'framework', 'dataset', 'validation',
'baseline', 'benchmark', 'evaluation', 'metrics', 'results'
]
tech_count = sum(1 for term in technical_terms if term in text_lower)
tech_score = min(tech_count / 5, 0.3)
score += tech_score
# Length as depth proxy
length_score = min(len(research_desc) / 1000, 0.2)
score += length_score
logger.info(f"Research depth: {score:.2f}")
return min(score, 1.0)
def _analyze_thesis(self, thesis_text: str) -> float:
"""
Analyze thesis/dissertation quality
Returns: 0-1 score based on structure and depth
"""
if not thesis_text or len(thesis_text) < 100:
return 0.0
text_lower = thesis_text.lower()
score = 0.0
# Thesis structure keywords
structure_keywords = [
'abstract', 'introduction', 'methodology', 'results',
'discussion', 'conclusion', 'references', 'chapter'
]
structure_count = sum(1 for kw in structure_keywords if kw in text_lower)
structure_score = min(structure_count / 5, 0.5)
score += structure_score
# Academic rigor indicators
rigor_keywords = [
'research question', 'objective', 'contribution', 'limitation',
'future work', 'significance', 'novelty', 'finding'
]
rigor_count = sum(1 for kw in rigor_keywords if kw in text_lower)
rigor_score = min(rigor_count / 4, 0.3)
score += rigor_score
# Length bonus
length_score = min(len(thesis_text) / 2000, 0.2)
score += length_score
logger.info(f"Thesis quality: {score:.2f}")
return min(score, 1.0)