Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Capricode ultra-fast multi-language detector - extreme performance edition
"""
import json
import logging
import os
import re
import time
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
@dataclass
class LanguageFeature:
    """Scoring signals that describe one programming language.

    The ``@dataclass`` decorator generates the positional ``__init__`` that
    callers rely on: ``LanguageFeature(patterns, keywords, structural)``.
    """

    # Precompiled regular expressions, each paired with its score weight.
    patterns: List[Tuple[re.Pattern, float]]
    # Plain substring keywords mapped to their score weight.
    keywords: Dict[str, float]
    # (feature name, weight) pairs for structural features.
    structural: List[Tuple[str, float]]
class UltraFastLanguageDetector:
    """Heuristic detector that guesses the language of a code snippet.

    Every candidate language is scored from three signal families:
    precompiled regular expressions, plain substring keywords, and
    character-ratio "structural" features.  Per-language detection counts
    are kept in ``usage_stats``, optionally persisted as JSON, and used to
    boost the score of languages that were detected before.

    Args:
        evolution_path: JSON file used to persist the usage counts.  Pass
            ``None`` (or an empty string) to keep the counts in memory only.
    """

    def __init__(self, evolution_path: Optional[str] = 'evolution_data.json'):
        self.evolution_path = evolution_path
        self.languages = {}
        self._compile_patterns()
        self.usage_stats = defaultdict(int)
        self.load_evolution_data()

    def _compile_patterns(self):
        """Compile every regular expression once and build the language table."""
        # HTML
        html_patterns = [
            (re.compile(r'<!DOCTYPE\s+html>', re.IGNORECASE), 5.0),
            (re.compile(r'<html[^>]*>', re.IGNORECASE), 4.0),
            (re.compile(r'</html>', re.IGNORECASE), 3.0),
            (re.compile(r'<(head|body|div|span|p|h[1-6])[^>]*>', re.IGNORECASE), 2.0),
            (re.compile(r'</\w+>'), 1.5),
        ]
        html_keywords = {'<!DOCTYPE': 5.0, '<html': 4.0, '<div': 2.0, '<span': 1.5}

        # CSS
        css_patterns = [
            (re.compile(r'\.\w+\s*\{'), 4.0),
            (re.compile(r'#\w+\s*\{'), 3.5),
            (re.compile(r'@media[^{]*\{'), 3.0),
            (re.compile(r'[\w-]+\s*:\s*[^;]+;'), 2.0),
        ]
        css_keywords = {'.class': 3.0, '#id': 3.0, '@media': 3.0, 'color:': 1.5}

        # JavaScript
        js_patterns = [
            (re.compile(r'function\s+\w+\s*\('), 4.0),
            (re.compile(r'const\s+\w+\s*='), 3.0),
            (re.compile(r'let\s+\w+\s*='), 3.0),
            (re.compile(r'console\.log\('), 2.5),
        ]
        js_keywords = {'function': 4.0, 'const': 3.0, 'let': 3.0, 'console.log': 2.5}

        # Python
        python_patterns = [
            (re.compile(r'def\s+\w+\s*\('), 4.0),
            (re.compile(r'class\s+\w+'), 3.5),
            (re.compile(r'import\s+\w+'), 3.0),
            (re.compile(r'from\s+\w+\s+import'), 3.0),
        ]
        python_keywords = {'def': 4.0, 'class': 3.5, 'import': 3.0, 'print': 2.0}

        # Language name -> scoring signals.
        self.languages = {
            'html': LanguageFeature(html_patterns, html_keywords, []),
            'css': LanguageFeature(css_patterns, css_keywords, []),
            'javascript': LanguageFeature(js_patterns, js_keywords, []),
            'python': LanguageFeature(python_patterns, python_keywords, []),
            'java': LanguageFeature([
                (re.compile(r'public\s+class\s+\w+'), 5.0),
                (re.compile(r'public\s+static\s+void\s+main'), 4.5),
            ], {'public class': 5.0, 'System.out.println': 3.0}, []),
            'cpp': LanguageFeature([
                (re.compile(r'#include\s*<[^>]+>'), 4.5),
                (re.compile(r'int\s+main\s*\('), 4.0),
            ], {'#include': 4.0, 'using namespace': 3.0}, []),
        }

        # Structural features consulted per language.
        # NOTE(review): 'attribute_ratio' and 'function_ratio' are never
        # produced by extract_structural_features, so they currently
        # contribute nothing to any score.
        self.structural_features = {
            'html': [('tag_ratio', 2.0), ('attribute_ratio', 1.5)],
            'css': [('brace_ratio', 2.0), ('semicolon_ratio', 1.5)],
            'javascript': [('bracket_ratio', 1.5), ('function_ratio', 2.0)],
        }

    def extract_structural_features(self, code: str) -> Dict[str, float]:
        """Return character-ratio features of ``code``.

        Returns:
            An empty dict for empty input; otherwise the share of ``<``,
            braces, parentheses and semicolons among all characters, plus
            the variance of the line lengths.
        """
        total_chars = len(code)
        if total_chars == 0:
            return {}

        line_lengths = [len(line) for line in code.split('\n')]
        return {
            'tag_ratio': code.count('<') / total_chars,
            'brace_ratio': (code.count('{') + code.count('}')) / total_chars,
            'bracket_ratio': (code.count('(') + code.count(')')) / total_chars,
            'semicolon_ratio': code.count(';') / total_chars,
            # Converted to a plain float so the result stays JSON-serialisable.
            'line_length_var': float(np.var(line_lengths)),
        }

    def _score_language(self, lang: str, feature_set: "LanguageFeature", code: str,
                        structural_features: Dict[str, float]) -> Tuple[float, List[str]]:
        """Return the raw score of ``lang`` for ``code`` and the evidence behind it."""
        score = 0.0
        evidence = []

        # 1. Regular-expression matches: every match adds the pattern weight.
        for pattern, weight in feature_set.patterns:
            hits = len(pattern.findall(code))
            if hits:
                score += hits * weight
                evidence.append(f"pattern:{pattern.pattern[:20]}({hits})")

        # 2. Keyword matches: plain substring counts, not whole-word matches.
        for keyword, weight in feature_set.keywords.items():
            count = code.count(keyword)
            if count > 0:
                score += count * weight
                evidence.append(f"keyword:{keyword}({count})")

        # 3. Structural features: ratios at or below 1% are ignored.
        for feature_name, weight in self.structural_features.get(lang, []):
            value = structural_features.get(feature_name, 0)
            if value > 0.01:
                contribution = value * weight * 100
                score += contribution
                evidence.append(f"structure:{feature_name}({contribution:.1f})")

        return score, evidence

    def _record_detection(self, lang: str) -> None:
        """Increment the usage counter of ``lang`` and persist the counters."""
        self.usage_stats[lang] = self.usage_stats.get(lang, 0) + 1
        self.save_evolution_data()

    def detect(self, code: str, use_evolution: bool = True) -> Dict[str, Any]:
        """Detect the most likely language of ``code``.

        Args:
            code: Source text to classify.
            use_evolution: When true, multiply each language score by
                ``1 + 0.1 * usage_count`` so previously detected languages
                are favoured.

        Returns:
            A result dict that always contains ``language``, ``confidence``,
            ``processing_time_ms`` and ``is_optimized``.  Empty input or
            input without any matching signal yields language ``'text'``.
        """
        start_time = time.time()
        if not code or not code.strip():
            return self._quick_result('text', 0.0, 'Empty code')

        code = code.strip()
        structural_features = self.extract_structural_features(code)

        scores = {}
        features_used = {}
        for lang, feature_set in self.languages.items():
            score, evidence = self._score_language(lang, feature_set, code, structural_features)
            if score <= 0:
                continue
            if use_evolution:
                score *= 1.0 + self.usage_stats.get(lang, 0) * 0.1
            scores[lang] = score
            features_used[lang] = evidence[:3]  # keep only the first three signals

        # Mixed-language documents (HTML with embedded CSS/JS) take priority.
        if len(scores) > 1:
            mixed_result = self._detect_mixed_language(scores, structural_features)
            if mixed_result:
                processing_time = (time.time() - start_time) * 1000
                self._record_detection(mixed_result['language'])
                return {
                    **mixed_result,
                    'processing_time_ms': round(processing_time, 2),
                    'is_optimized': True,
                }

        if not scores:
            return self._quick_result('text', 0.0, 'No language features detected')

        # Single-language result: the winner's share of the total score is
        # reported as confidence, capped at 0.99.
        best_lang = max(scores, key=scores.get)
        best_score = scores[best_lang]
        total_score = sum(scores.values())
        confidence = best_score / total_score if total_score > 0 else 0.0
        processing_time = (time.time() - start_time) * 1000

        self._record_detection(best_lang)
        return {
            'language': best_lang,
            'confidence': round(min(confidence, 0.99), 3),
            'score': round(best_score, 2),
            'all_scores': {k: round(v, 2) for k, v in scores.items()},
            'features': features_used.get(best_lang, []),
            'processing_time_ms': round(processing_time, 2),
            'is_optimized': True,
            'evolution_boost': self.usage_stats[best_lang],
        }

    def _detect_mixed_language(self, scores: Dict[str, float],
                               structural: Dict[str, float]) -> Optional[Dict[str, Any]]:
        """Recognise HTML documents that embed CSS and/or JavaScript.

        Args:
            scores: Per-language scores computed by ``detect``.
            structural: Structural features; accepted for interface
                compatibility but not used.

        Returns:
            A partial result dict when the HTML score exceeds 10 and the CSS
            or JavaScript score exceeds 5, otherwise ``None``.  Only the
            embedded languages that actually exceed the threshold are listed.
        """
        html_score = scores.get('html', 0)
        css_score = scores.get('css', 0)
        js_score = scores.get('javascript', 0)

        embedded = [
            lang
            for lang, lang_score in (('css', css_score), ('javascript', js_score))
            if lang_score > 5
        ]
        if html_score > 10 and embedded:
            return {
                'language': 'html',
                'confidence': 0.85,
                'is_mixed': True,
                'mixed_with': list(embedded),
                'primary_language': 'html',
                'embedded_languages': embedded,
                'score': html_score + max(css_score, js_score),
            }
        return None

    def _quick_result(self, lang: str, confidence: float, message: str) -> Dict[str, Any]:
        """Build the short result used when no scoring was performed."""
        return {
            'language': lang,
            'confidence': confidence,
            'message': message,
            # Fixed placeholder value, not a measured duration.
            'processing_time_ms': 0.1,
            'is_optimized': True,
        }

    def load_evolution_data(self):
        """Load persisted usage counts from ``evolution_path``, if any.

        A missing file is not an error.  An unreadable or malformed file
        resets ``usage_stats`` to an empty counter.  Entries whose count is
        not an integer are skipped so later score arithmetic cannot fail.
        """
        if not self.evolution_path:
            return
        try:
            with open(self.evolution_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return  # nothing has been persisted yet
        except (OSError, ValueError) as exc:
            logging.getLogger(__name__).debug(
                "Could not load evolution data from %s: %s", self.evolution_path, exc)
            self.usage_stats = defaultdict(int)
            return

        stored = data.get('usage_stats', {}) if isinstance(data, dict) else None
        if not isinstance(stored, dict):
            self.usage_stats = defaultdict(int)
            return
        for lang, count in stored.items():
            if isinstance(count, int) and not isinstance(count, bool):
                self.usage_stats[lang] = count

    def save_evolution_data(self):
        """Persist the usage counts to ``evolution_path`` on a best-effort basis.

        Failures are logged at debug level and never propagate, so a
        read-only working directory cannot break detection.
        """
        if not self.evolution_path:
            return
        try:
            data = {
                'usage_stats': dict(self.usage_stats),
                'total_detections': sum(self.usage_stats.values()),
                'last_updated': time.time(),
            }
            with open(self.evolution_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError) as exc:
            logging.getLogger(__name__).debug(
                "Could not save evolution data to %s: %s", self.evolution_path, exc)
# Shared module-level detector, built once when the module is imported.
ultra_detector = UltraFastLanguageDetector()


def detect_language_ultra_fast(code: str) -> Dict[str, Any]:
    """Detect the language of ``code`` using the shared ``ultra_detector``.

    Returns the result dict produced by ``UltraFastLanguageDetector.detect``.
    """
    result = ultra_detector.detect(code)
    return result