capricode-codefix / language_detector.py
pangxiang's picture
Create language_detector.py
3ae6216 verified
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Capricode 超高速多语言识别器 - 极致性能版
"""
import json
import os
import re
import time
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
@dataclass
class LanguageFeature:
    """Weighted detection features for a single language."""

    # Pre-compiled regular expressions paired with the weight added per match.
    patterns: List[Tuple[re.Pattern, float]]
    # Literal keywords (matched as plain substrings) mapped to their weight.
    keywords: Dict[str, float]
    # (feature name, weight) pairs for structural features.
    structural: List[Tuple[str, float]]
class UltraFastLanguageDetector:
    """Score-based detector that guesses the language of a code snippet.

    Every supported language has pre-compiled regex patterns and literal
    keywords, each with a weight.  A snippet's score for a language is the
    weighted number of hits plus a few character-ratio ("structural")
    bonuses.  The number of times each language has been returned is
    persisted to a JSON file and used to boost that language's score on
    later calls (the "evolution" weighting).
    """

    # Default JSON file (relative to the working directory) holding usage stats.
    DEFAULT_EVOLUTION_PATH = 'evolution_data.json'

    # Thresholds for reporting HTML with embedded CSS / JavaScript.
    MIXED_HTML_MIN_SCORE = 10
    MIXED_EMBEDDED_MIN_SCORE = 5

    def __init__(self, evolution_path: str = DEFAULT_EVOLUTION_PATH):
        """Compile all patterns and load previously persisted usage stats.

        Args:
            evolution_path: JSON file used to load and save usage statistics.
        """
        self.evolution_path = evolution_path
        self.languages = {}
        self._compile_patterns()
        self.usage_stats = defaultdict(int)
        self.load_evolution_data()

    def _compile_patterns(self):
        """Pre-compile every regular expression once, at start-up."""
        # HTML
        html_patterns = [
            (re.compile(r'<!DOCTYPE\s+html>', re.IGNORECASE), 5.0),
            (re.compile(r'<html[^>]*>', re.IGNORECASE), 4.0),
            (re.compile(r'</html>', re.IGNORECASE), 3.0),
            (re.compile(r'<(head|body|div|span|p|h[1-6])[^>]*>', re.IGNORECASE), 2.0),
            (re.compile(r'</\w+>'), 1.5),
        ]
        html_keywords = {'<!DOCTYPE': 5.0, '<html': 4.0, '<div': 2.0, '<span': 1.5}

        # CSS
        css_patterns = [
            (re.compile(r'\.\w+\s*\{'), 4.0),
            (re.compile(r'#\w+\s*\{'), 3.5),
            (re.compile(r'@media[^{]*\{'), 3.0),
            (re.compile(r'[\w-]+\s*:\s*[^;]+;'), 2.0),
        ]
        css_keywords = {'.class': 3.0, '#id': 3.0, '@media': 3.0, 'color:': 1.5}

        # JavaScript
        js_patterns = [
            (re.compile(r'function\s+\w+\s*\('), 4.0),
            (re.compile(r'const\s+\w+\s*='), 3.0),
            (re.compile(r'let\s+\w+\s*='), 3.0),
            (re.compile(r'console\.log\('), 2.5),
        ]
        js_keywords = {'function': 4.0, 'const': 3.0, 'let': 3.0, 'console.log': 2.5}

        # Python
        python_patterns = [
            (re.compile(r'def\s+\w+\s*\('), 4.0),
            (re.compile(r'class\s+\w+'), 3.5),
            (re.compile(r'import\s+\w+'), 3.0),
            (re.compile(r'from\s+\w+\s+import'), 3.0),
        ]
        python_keywords = {'def': 4.0, 'class': 3.5, 'import': 3.0, 'print': 2.0}

        # Further language definitions.
        self.languages = {
            'html': LanguageFeature(html_patterns, html_keywords, []),
            'css': LanguageFeature(css_patterns, css_keywords, []),
            'javascript': LanguageFeature(js_patterns, js_keywords, []),
            'python': LanguageFeature(python_patterns, python_keywords, []),
            'java': LanguageFeature([
                (re.compile(r'public\s+class\s+\w+'), 5.0),
                (re.compile(r'public\s+static\s+void\s+main'), 4.5),
            ], {'public class': 5.0, 'System.out.println': 3.0}, []),
            'cpp': LanguageFeature([
                (re.compile(r'#include\s*<[^>]+>'), 4.5),
                (re.compile(r'int\s+main\s*\('), 4.0),
            ], {'#include': 4.0, 'using namespace': 3.0}, []),
        }

        # Structural features: language -> [(feature name, weight)].
        # NOTE: 'attribute_ratio' and 'function_ratio' are not produced by
        # extract_structural_features(), so those two entries currently
        # contribute nothing to the score.
        self.structural_features = {
            'html': [('tag_ratio', 2.0), ('attribute_ratio', 1.5)],
            'css': [('brace_ratio', 2.0), ('semicolon_ratio', 1.5)],
            'javascript': [('bracket_ratio', 1.5), ('function_ratio', 2.0)],
        }

    def extract_structural_features(self, code: str) -> Dict[str, float]:
        """Return character-ratio features of ``code``.

        Args:
            code: The snippet to analyse.

        Returns:
            An empty dict for empty input; otherwise the fraction of
            characters that are ``<``, braces, parentheses and semicolons,
            plus the variance of the line lengths.
        """
        total_chars = len(code)
        if total_chars == 0:
            return {}

        # str.split always yields at least one element, so the variance is
        # always defined here.
        line_lengths = [len(line) for line in code.split('\n')]
        return {
            'tag_ratio': code.count('<') / total_chars,
            'brace_ratio': (code.count('{') + code.count('}')) / total_chars,
            'bracket_ratio': (code.count('(') + code.count(')')) / total_chars,
            'semicolon_ratio': code.count(';') / total_chars,
            'line_length_var': float(np.var(line_lengths)),
        }

    def _score_language(self, lang: str, feature_set: Any, code: str,
                        structural_features: Dict[str, float]) -> Tuple[float, List[str]]:
        """Return the raw score of ``code`` for ``lang`` and the features that hit."""
        score = 0.0
        hits: List[str] = []

        # 1. Regex matches: every match adds the pattern's weight.
        for pattern, weight in feature_set.patterns:
            match_count = len(pattern.findall(code))
            if match_count:
                score += match_count * weight
                hits.append(f"pattern:{pattern.pattern[:20]}({match_count})")

        # 2. Keyword matches: plain substring counts.
        for keyword, weight in feature_set.keywords.items():
            count = code.count(keyword)
            if count > 0:
                score += count * weight
                hits.append(f"keyword:{keyword}({count})")

        # 3. Structural features; ratios at or below 1% are treated as noise.
        for feature_name, weight in self.structural_features.get(lang, []):
            feature_value = structural_features.get(feature_name, 0)
            if feature_value > 0.01:
                structural_score = feature_value * weight * 100
                score += structural_score
                hits.append(f"structure:{feature_name}({structural_score:.1f})")

        return score, hits

    def _record_detection(self, lang: str) -> None:
        """Count one more detection of ``lang`` and persist the statistics."""
        self.usage_stats[lang] = self.usage_stats.get(lang, 0) + 1
        self.save_evolution_data()

    def detect(self, code: str, use_evolution: bool = True) -> Dict[str, Any]:
        """Detect the most likely language of ``code``.

        Args:
            code: The snippet to classify.
            use_evolution: When true, multiply each language's score by
                ``1 + 0.1 * (times that language was returned before)``.

        Returns:
            A result dict.  Blank input, or input matching no feature, yields
            language ``'text'`` with confidence 0.0.  Otherwise the dict holds
            the winning language, confidence, score, all scores, up to three
            matched features and the processing time in milliseconds.  HTML
            with embedded CSS/JavaScript additionally carries the mixed-language keys.
        """
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.
        start_time = time.perf_counter()

        if not code or not code.strip():
            return self._quick_result('text', 0.0, 'Empty code')
        code = code.strip()

        structural_features = self.extract_structural_features(code)
        scores: Dict[str, float] = {}
        features_used: Dict[str, List[str]] = {}

        for lang, feature_set in self.languages.items():
            score, lang_features = self._score_language(
                lang, feature_set, code, structural_features)
            if score <= 0:
                continue
            if use_evolution:
                # .get() avoids inserting zero entries into the defaultdict,
                # which would otherwise be written to the stats file.
                score *= 1.0 + (self.usage_stats.get(lang, 0) * 0.1)
            scores[lang] = score
            features_used[lang] = lang_features[:3]  # keep the first three only

        if not scores:
            return self._quick_result('text', 0.0, 'No language features detected')

        rounded_scores = {k: round(v, 2) for k, v in scores.items()}

        # Mixed-language result (HTML with embedded CSS / JavaScript).
        if len(scores) > 1:
            mixed_result = self._detect_mixed_language(scores, structural_features)
            if mixed_result:
                processing_time = (time.perf_counter() - start_time) * 1000
                mixed_lang = mixed_result['language']
                self._record_detection(mixed_lang)
                return {
                    **mixed_result,
                    'score': round(mixed_result['score'], 2),
                    'all_scores': rounded_scores,
                    'features': features_used.get(mixed_lang, []),
                    'processing_time_ms': round(processing_time, 2),
                    'is_optimized': True,
                }

        # Single-language result.
        best_lang = max(scores.items(), key=lambda item: item[1])[0]
        best_score = scores[best_lang]
        total_score = sum(scores.values())
        confidence = best_score / total_score if total_score > 0 else 0.0
        processing_time = (time.perf_counter() - start_time) * 1000

        # Record usage statistics.
        self._record_detection(best_lang)

        return {
            'language': best_lang,
            'confidence': round(min(confidence, 0.99), 3),
            'score': round(best_score, 2),
            'all_scores': rounded_scores,
            'features': features_used.get(best_lang, []),
            'processing_time_ms': round(processing_time, 2),
            'is_optimized': True,
            'evolution_boost': self.usage_stats.get(best_lang, 0),
        }

    def _detect_mixed_language(self, scores: Dict[str, float],
                               structural: Dict[str, float]) -> Optional[Dict[str, Any]]:
        """Detect HTML that embeds CSS and/or JavaScript.

        Args:
            scores: Per-language scores computed by ``detect``.
            structural: Structural features of the snippet (currently unused).

        Returns:
            A partial result dict naming only the embedded languages whose
            score exceeds the threshold, or ``None`` when the snippet is not
            mixed HTML.
        """
        html_score = scores.get('html', 0)
        if html_score <= self.MIXED_HTML_MIN_SCORE:
            return None

        css_score = scores.get('css', 0)
        js_score = scores.get('javascript', 0)
        embedded = [
            lang
            for lang, lang_score in (('css', css_score), ('javascript', js_score))
            if lang_score > self.MIXED_EMBEDDED_MIN_SCORE
        ]
        if not embedded:
            return None

        return {
            'language': 'html',
            'confidence': 0.85,
            'is_mixed': True,
            'mixed_with': list(embedded),
            'primary_language': 'html',
            'embedded_languages': embedded,
            'score': html_score + max(css_score, js_score),
        }

    def _quick_result(self, lang: str, confidence: float, message: str) -> Dict[str, Any]:
        """Build the short result returned when no scoring took place."""
        return {
            'language': lang,
            'confidence': confidence,
            'message': message,
            'processing_time_ms': 0.1,  # fixed placeholder, not a measurement
            'is_optimized': True,
        }

    def load_evolution_data(self):
        """Load persisted usage statistics from ``self.evolution_path``.

        A missing file leaves the current statistics untouched.  An
        unreadable or unparsable file resets them.  Entries whose count is
        not a number are skipped so they cannot break later scoring.
        """
        try:
            with open(self.evolution_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return  # nothing persisted yet
        except (OSError, ValueError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            self.usage_stats = defaultdict(int)
            return

        stats = data.get('usage_stats') if isinstance(data, dict) else None
        if not isinstance(stats, dict):
            return
        for lang, count in stats.items():
            if isinstance(count, (int, float)) and not isinstance(count, bool):
                self.usage_stats[lang] = count

    def save_evolution_data(self):
        """Persist usage statistics to ``self.evolution_path`` (best effort)."""
        data = {
            'usage_stats': dict(self.usage_stats),
            'total_detections': sum(self.usage_stats.values()),
            'last_updated': time.time(),
        }
        try:
            with open(self.evolution_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError):
            # Persistence is optional: a read-only or missing location must
            # not make detection fail.
            pass
# Module-level shared detector instance, created once when the module is imported.
ultra_detector = UltraFastLanguageDetector()
def detect_language_ultra_fast(code: str) -> Dict[str, Any]:
    """Detect the language of ``code`` with the shared module-level detector.

    Args:
        code: The source snippet to classify.

    Returns:
        The result dict produced by ``ultra_detector.detect(code)``.
    """
    return ultra_detector.detect(code)