#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Capricode ultra-fast multi-language detector.

Scores a code snippet against per-language rule sets (precompiled regular
expressions, literal keywords and character-ratio "structural" features) and
reports the best-scoring language.  Per-language detection counts are
persisted to a small JSON file and fed back into scoring as a usage-based
boost ("evolution").
"""
import json
import logging
import os
import re
import time
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)

# Default location of the persisted usage statistics (relative to the CWD).
DEFAULT_EVOLUTION_PATH = 'evolution_data.json'


@dataclass
class LanguageFeature:
    """Detection rules for a single language."""

    # Precompiled regex + weight added per match.
    patterns: List[Tuple[re.Pattern, float]]
    # Literal substring + weight added per occurrence.
    keywords: Dict[str, float]
    # Structural feature name + weight.  NOTE(review): ``detect`` reads the
    # detector-level ``structural_features`` table, not this field.
    structural: List[Tuple[str, float]]


class UltraFastLanguageDetector:
    """Rule-based language detector with usage-based score boosting."""

    # Structural ratios at or below this value are ignored as noise.
    STRUCTURAL_THRESHOLD = 0.01
    # Score thresholds used to report HTML with embedded CSS/JavaScript.
    MIXED_HTML_MIN_SCORE = 10
    MIXED_EMBEDDED_MIN_SCORE = 5

    def __init__(self, evolution_path: Optional[str] = DEFAULT_EVOLUTION_PATH):
        """Build the rule tables and load persisted usage statistics.

        Args:
            evolution_path: JSON file used to persist usage statistics.
                Pass ``None`` to keep statistics in memory only.
        """
        self.evolution_path = evolution_path
        self.languages: Dict[str, LanguageFeature] = {}
        self.structural_features: Dict[str, List[Tuple[str, float]]] = {}
        self._compile_patterns()
        self.usage_stats = defaultdict(int)
        self.load_evolution_data()

    def _compile_patterns(self):
        """Precompile every regular expression once, at construction time.

        NOTE(review): the reviewed copy of this method had been damaged by
        tag stripping (every ``<...>`` span was deleted).  Only the pattern
        weights, the ``<(head|body|...)`` pattern, the tail of the C/C++
        entry and the structural table survived verbatim.  The remaining
        HTML patterns/keywords and the whole CSS and JavaScript entries are
        reconstructions, and any other language entries that lived in the
        lost span are absent.  Restore the originals from version control.
        """
        # HTML
        html_patterns = [
            # Reconstructed: the original pattern text was stripped.
            (re.compile(r'<!DOCTYPE html>', re.IGNORECASE), 5.0),
            # Reconstructed from the surviving tail ``]*>``.
            (re.compile(r'<html[^>]*>', re.IGNORECASE), 4.0),
            # Reconstructed: the original pattern text was stripped.
            (re.compile(r'</html>', re.IGNORECASE), 3.0),
            (re.compile(r'<(head|body|div|span|p|h[1-6])[^>]*>', re.IGNORECASE), 2.0),
            # Reconstructed: the original pattern text was stripped.
            (re.compile(r'<!--.*?-->'), 1.5),
        ]
        # Reconstructed: the original keyword table was stripped.
        html_keywords = {'<div': 1.0, '<span': 1.0, 'href=': 1.0, 'class=': 1.0}

        self.languages = {
            'html': LanguageFeature(html_patterns, html_keywords, []),
            # Reconstructed entry (original lost).
            'css': LanguageFeature(
                [
                    (re.compile(r'[.#]?[\w-]+\s*\{[^{}]*\}'), 3.0),
                    (re.compile(r'@(media|import|font-face|keyframes)\b'), 3.0),
                ],
                {'color:': 1.5, 'margin': 1.0, 'padding': 1.0, 'font-size': 1.0},
                [],
            ),
            # Reconstructed entry (original lost).
            'javascript': LanguageFeature(
                [
                    (re.compile(r'\bfunction\s*[\w$]*\s*\('), 3.0),
                    (re.compile(r'\b(const|let|var)\s+[\w$]+\s*='), 2.5),
                    (re.compile(r'\bconsole\.\w+\s*\('), 3.0),
                    (re.compile(r'\bdocument\.\w+'), 2.5),
                    (re.compile(r'=>'), 2.0),
                ],
                {'function': 1.5, '===': 2.0},
                [],
            ),
            # The two patterns and the keyword table below survived in the
            # reviewed copy; earlier patterns of this entry may be missing.
            'cpp': LanguageFeature(
                [
                    (re.compile(r'#include\s*<[^>]+>'), 4.5),
                    (re.compile(r'int\s+main\s*\('), 4.0),
                ],
                {'#include': 4.0, 'using namespace': 3.0},
                [],
            ),
        }

        # Structural features.
        # NOTE(review): 'attribute_ratio' and 'function_ratio' are never
        # produced by extract_structural_features, so they contribute 0.
        self.structural_features = {
            'html': [('tag_ratio', 2.0), ('attribute_ratio', 1.5)],
            'css': [('brace_ratio', 2.0), ('semicolon_ratio', 1.5)],
            'javascript': [('bracket_ratio', 1.5), ('function_ratio', 2.0)],
        }

    def extract_structural_features(self, code: str) -> Dict[str, float]:
        """Return character-ratio features of ``code``.

        Args:
            code: Source text to analyse.

        Returns:
            A dict with ``tag_ratio``, ``brace_ratio``, ``bracket_ratio`` and
            ``semicolon_ratio`` (occurrences divided by the total number of
            characters) plus ``line_length_var`` (variance of line lengths).
            An empty dict for empty input.
        """
        total_chars = len(code)
        if total_chars == 0:
            return {}

        line_lengths = [len(line) for line in code.split('\n')]
        return {
            'tag_ratio': code.count('<') / total_chars,
            'brace_ratio': (code.count('{') + code.count('}')) / total_chars,
            'bracket_ratio': (code.count('(') + code.count(')')) / total_chars,
            'semicolon_ratio': code.count(';') / total_chars,
            # Cast so the value is a plain float rather than a NumPy scalar.
            'line_length_var': float(np.var(line_lengths)),
        }

    def _score_language(
        self,
        lang: str,
        feature_set: LanguageFeature,
        code: str,
        structural_features: Dict[str, float],
    ) -> Tuple[float, List[str]]:
        """Return the raw score of ``code`` for ``lang`` and its evidence."""
        score = 0.0
        evidence: List[str] = []

        # 1. Regex matches: weight per match.
        for pattern, weight in feature_set.patterns:
            hits = len(pattern.findall(code))
            if hits:
                score += hits * weight
                evidence.append(f"pattern:{pattern.pattern[:20]}({hits})")

        # 2. Keyword matches: weight per (substring) occurrence.
        for keyword, weight in feature_set.keywords.items():
            count = code.count(keyword)
            if count > 0:
                score += count * weight
                evidence.append(f"keyword:{keyword}({count})")

        # 3. Structural features: ratios above the noise threshold, scaled
        #    by 100 so they are comparable with pattern/keyword weights.
        for feature_name, weight in self.structural_features.get(lang, []):
            value = structural_features.get(feature_name, 0)
            if value > self.STRUCTURAL_THRESHOLD:
                structural_score = value * weight * 100
                score += structural_score
                evidence.append(f"structure:{feature_name}({structural_score:.1f})")

        return score, evidence

    def _record_usage(self, lang: str) -> None:
        """Count one detection of ``lang`` and persist the statistics."""
        self.usage_stats[lang] += 1
        self.save_evolution_data()

    def detect(self, code: str, use_evolution: bool = True) -> Dict[str, Any]:
        """Detect the language of ``code``.

        Args:
            code: Source text to classify.
            use_evolution: When true, multiply each language's score by
                ``1 + 0.1 * <times that language was detected before>``.
                Usage is recorded regardless of this flag.

        Returns:
            A result dict with at least ``language``, ``confidence``,
            ``processing_time_ms`` and ``is_optimized``.  Blank input, or
            input that matches no rule, yields language ``'text'`` with
            confidence 0.0 and a ``message``.  HTML with embedded CSS or
            JavaScript additionally carries ``is_mixed`` and
            ``embedded_languages``.
        """
        # perf_counter is monotonic, unlike time.time().
        start_time = time.perf_counter()

        if not code or not code.strip():
            return self._quick_result('text', 0.0, 'Empty code')

        code = code.strip()
        # Structural features are computed once and shared by all languages.
        structural_features = self.extract_structural_features(code)

        scores: Dict[str, float] = {}
        features_used: Dict[str, List[str]] = {}
        for lang, feature_set in self.languages.items():
            score, evidence = self._score_language(
                lang, feature_set, code, structural_features
            )
            if score <= 0:
                continue
            if use_evolution:
                # .get avoids inserting zero counts that would be persisted.
                # NOTE(review): this boost is unbounded, so a frequently
                # detected language gains an ever-growing advantage.
                score *= 1.0 + self.usage_stats.get(lang, 0) * 0.1
            scores[lang] = score
            features_used[lang] = evidence[:3]  # keep the first 3 only

        if not scores:
            return self._quick_result('text', 0.0, 'No language features detected')

        rounded_scores = {lang: round(value, 2) for lang, value in scores.items()}

        # Mixed-language result (HTML with embedded CSS/JavaScript).
        if len(scores) > 1:
            mixed_result = self._detect_mixed_language(scores, structural_features)
            if mixed_result is not None:
                mixed_lang = mixed_result['language']
                processing_time = (time.perf_counter() - start_time) * 1000
                self._record_usage(mixed_lang)
                return {
                    **mixed_result,
                    'all_scores': rounded_scores,
                    'features': features_used.get(mixed_lang, []),
                    'processing_time_ms': round(processing_time, 2),
                    'is_optimized': True,
                    'evolution_boost': self.usage_stats[mixed_lang],
                }

        # Single-language result.
        best_lang = max(scores, key=scores.get)
        best_score = scores[best_lang]
        total_score = sum(scores.values())
        confidence = best_score / total_score if total_score > 0 else 0.0
        processing_time = (time.perf_counter() - start_time) * 1000

        self._record_usage(best_lang)

        return {
            'language': best_lang,
            # Capped so the detector never claims full certainty.
            'confidence': round(min(confidence, 0.99), 3),
            'score': round(best_score, 2),
            'all_scores': rounded_scores,
            'features': features_used.get(best_lang, []),
            'processing_time_ms': round(processing_time, 2),
            'is_optimized': True,
            'evolution_boost': self.usage_stats[best_lang],
        }

    def _detect_mixed_language(
        self, scores: Dict[str, float], structural: Dict[str, float]
    ) -> Optional[Dict[str, Any]]:
        """Report HTML that embeds CSS and/or JavaScript.

        Args:
            scores: Per-language scores computed by ``detect``.
            structural: Structural features of the input (currently unused).

        Returns:
            A partial result dict when the HTML score exceeds
            ``MIXED_HTML_MIN_SCORE`` and at least one of CSS/JavaScript
            exceeds ``MIXED_EMBEDDED_MIN_SCORE``; otherwise ``None``.
            ``mixed_with`` / ``embedded_languages`` list only the languages
            that actually crossed the threshold.
        """
        html_score = scores.get('html', 0)
        css_score = scores.get('css', 0)
        js_score = scores.get('javascript', 0)

        if html_score <= self.MIXED_HTML_MIN_SCORE:
            return None

        embedded = [
            lang
            for lang, value in (('css', css_score), ('javascript', js_score))
            if value > self.MIXED_EMBEDDED_MIN_SCORE
        ]
        if not embedded:
            return None

        return {
            'language': 'html',
            'confidence': 0.85,
            'is_mixed': True,
            'mixed_with': list(embedded),
            'primary_language': 'html',
            'embedded_languages': embedded,
            'score': html_score + max(css_score, js_score),
        }

    def _quick_result(self, lang: str, confidence: float, message: str) -> Dict[str, Any]:
        """Build the short result used for early exits.

        ``processing_time_ms`` is a fixed nominal 0.1, not a measurement.
        """
        return {
            'language': lang,
            'confidence': confidence,
            'message': message,
            'processing_time_ms': 0.1,
            'is_optimized': True,
        }

    def load_evolution_data(self):
        """Load persisted usage statistics, ignoring missing or bad files.

        Only non-negative integer counts are accepted; anything else in the
        file is skipped so it cannot break score arithmetic later.
        """
        if self.evolution_path is None:
            return

        try:
            with open(self.evolution_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return
        except (OSError, ValueError) as exc:
            # ValueError covers malformed JSON and undecodable bytes.
            logger.debug("Ignoring unreadable evolution data %r: %s",
                         self.evolution_path, exc)
            self.usage_stats = defaultdict(int)
            return

        stats = data.get('usage_stats') if isinstance(data, dict) else None
        if not isinstance(stats, dict):
            return

        for lang, count in stats.items():
            # bool is a subclass of int; reject it explicitly.
            if isinstance(count, int) and not isinstance(count, bool) and count >= 0:
                self.usage_stats[lang] = count

    def save_evolution_data(self):
        """Persist usage statistics (best effort; failures are only logged)."""
        if self.evolution_path is None:
            return

        data = {
            'usage_stats': dict(self.usage_stats),
            'total_detections': sum(self.usage_stats.values()),
            'last_updated': time.time(),
        }
        tmp_path = f"{self.evolution_path}.tmp"
        try:
            # Write to a sibling file and rename it into place so a crash
            # mid-write cannot leave a truncated statistics file behind.
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, self.evolution_path)
        except (OSError, TypeError, ValueError) as exc:
            logger.debug("Could not save evolution data to %r: %s",
                         self.evolution_path, exc)


# Global instance
ultra_detector = UltraFastLanguageDetector()


def detect_language_ultra_fast(code: str) -> Dict[str, Any]:
    """Detect the language of ``code`` using the shared global detector."""
    return ultra_detector.detect(code)