Spaces:

pangxiang
/

capricode-codefix

Sleeping

App Files Files Community

capricode-codefix / language_detector.py

pangxiang

Create language_detector.py

3ae6216 verified 3 months ago

raw

history blame contribute delete

10.1 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Capricode 超高速多语言识别器 - 极致性能版
	"""
	import json
	import os
	import re
	import time
	from collections import Counter, defaultdict
	from dataclasses import dataclass
	from typing import Any, Dict, List, Optional, Tuple

	import numpy as np

	@dataclass
	class LanguageFeature:
	    """Bundle of the detection features defined for a single language."""

	    # Pre-compiled regular expressions, each paired with its score weight.
	    patterns: List[Tuple[re.Pattern, float]]
	    # Keyword strings mapped to their score weight.
	    keywords: Dict[str, float]
	    # (feature name, weight) pairs describing structural features.
	    structural: List[Tuple[str, float]]

	class UltraFastLanguageDetector:
	    """Heuristic detector that guesses the language of a code snippet.

	    Every known language is scored by summing three kinds of evidence:
	    weighted regex matches, weighted keyword substring counts and weighted
	    character-ratio ("structural") features. Per-language usage counters
	    are persisted to ``evolution_data.json`` in the current working
	    directory and used to boost languages that were detected before.
	    """

	    # File (relative to the current working directory) holding the counters.
	    EVOLUTION_FILE = 'evolution_data.json'
	    # Upper bound for the usage-based score multiplier. Without a cap the
	    # persisted counters grow forever and the most frequently detected
	    # language ends up winning regardless of the input.
	    MAX_EVOLUTION_MULTIPLIER = 2.0

	    def __init__(self):
	        """Build the pattern tables and load any persisted usage counters."""
	        self.languages = {}
	        self.usage_stats = defaultdict(int)
	        self._compile_patterns()
	        self.load_evolution_data()

	    @staticmethod
	    def _language_definitions() -> Dict[str, Tuple[List[Tuple[re.Pattern, float]], Dict[str, float]]]:
	        """Return ``{language: (compiled patterns, keywords)}`` for all languages."""
	        html_patterns = [
	            (re.compile(r'<!DOCTYPE\s+html>', re.IGNORECASE), 5.0),
	            (re.compile(r'<html[^>]*>', re.IGNORECASE), 4.0),
	            (re.compile(r'</html>', re.IGNORECASE), 3.0),
	            # Plain "|" alternation: an escaped "\|" would only match a
	            # literal pipe character and never a real tag.
	            (re.compile(r'<(head|body|div|span|p|h[1-6])[^>]*>', re.IGNORECASE), 2.0),
	            (re.compile(r'</\w+>'), 1.5),
	        ]
	        html_keywords = {'<!DOCTYPE': 5.0, '<html': 4.0, '<div': 2.0, '<span': 1.5}

	        css_patterns = [
	            (re.compile(r'\.\w+\s*\{'), 4.0),
	            (re.compile(r'#\w+\s*\{'), 3.5),
	            (re.compile(r'@media[^{]*\{'), 3.0),
	            # Optional whitespace around ":" so "color: red;" and
	            # "margin:0;" both count as declarations.
	            (re.compile(r'[\w-]+\s*:\s*[^;]+;'), 2.0),
	        ]
	        css_keywords = {'.class': 3.0, '#id': 3.0, '@media': 3.0, 'color:': 1.5}

	        js_patterns = [
	            (re.compile(r'function\s+\w+\s*\('), 4.0),
	            (re.compile(r'const\s+\w+\s*='), 3.0),
	            (re.compile(r'let\s+\w+\s*='), 3.0),
	            (re.compile(r'console\.log\('), 2.5),
	        ]
	        js_keywords = {'function': 4.0, 'const': 3.0, 'let': 3.0, 'console.log': 2.5}

	        python_patterns = [
	            (re.compile(r'def\s+\w+\s*\('), 4.0),
	            (re.compile(r'class\s+\w+'), 3.5),
	            (re.compile(r'import\s+\w+'), 3.0),
	            (re.compile(r'from\s+\w+\s+import'), 3.0),
	        ]
	        python_keywords = {'def': 4.0, 'class': 3.5, 'import': 3.0, 'print': 2.0}

	        java_patterns = [
	            (re.compile(r'public\s+class\s+\w+'), 5.0),
	            (re.compile(r'public\s+static\s+void\s+main'), 4.5),
	        ]
	        java_keywords = {'public class': 5.0, 'System.out.println': 3.0}

	        cpp_patterns = [
	            (re.compile(r'#include\s*<[^>]+>'), 4.5),
	            (re.compile(r'int\s+main\s*\('), 4.0),
	        ]
	        cpp_keywords = {'#include': 4.0, 'using namespace': 3.0}

	        # Insertion order matters: detect() breaks score ties in favour of
	        # the language that appears first.
	        return {
	            'html': (html_patterns, html_keywords),
	            'css': (css_patterns, css_keywords),
	            'javascript': (js_patterns, js_keywords),
	            'python': (python_patterns, python_keywords),
	            'java': (java_patterns, java_keywords),
	            'cpp': (cpp_patterns, cpp_keywords),
	        }

	    def _compile_patterns(self):
	        """Pre-compile every regex once and populate the feature tables.

	        Sets ``self.languages`` (language name -> ``LanguageFeature``) and
	        ``self.structural_features`` (language name -> list of
	        ``(feature name, weight)`` pairs).
	        """
	        self.languages = {
	            lang: LanguageFeature(patterns, keywords, [])
	            for lang, (patterns, keywords) in self._language_definitions().items()
	        }

	        # NOTE(review): 'attribute_ratio' and 'function_ratio' are not
	        # produced by extract_structural_features(), so these two entries
	        # currently never contribute to a score.
	        self.structural_features = {
	            'html': [('tag_ratio', 2.0), ('attribute_ratio', 1.5)],
	            'css': [('brace_ratio', 2.0), ('semicolon_ratio', 1.5)],
	            'javascript': [('bracket_ratio', 1.5), ('function_ratio', 2.0)],
	        }

	    def extract_structural_features(self, code: str) -> Dict[str, float]:
	        """Compute character-ratio features for ``code``.

	        Returns:
	            An empty dict for empty input; otherwise a dict with the keys
	            ``tag_ratio``, ``brace_ratio``, ``bracket_ratio``,
	            ``semicolon_ratio`` (occurrences divided by the total number of
	            characters) and ``line_length_var`` (variance of line lengths).
	        """
	        total_chars = len(code)
	        if total_chars == 0:
	            return {}

	        # str.split always yields at least one element, so the variance is
	        # always defined.
	        line_lengths = [len(line) for line in code.split('\n')]
	        return {
	            'tag_ratio': code.count('<') / total_chars,
	            'brace_ratio': (code.count('{') + code.count('}')) / total_chars,
	            'bracket_ratio': (code.count('(') + code.count(')')) / total_chars,
	            'semicolon_ratio': code.count(';') / total_chars,
	            'line_length_var': float(np.var(line_lengths)),
	        }

	    def _score_language(self, lang: str, feature_set: Any, code: str,
	                        structural_features: Dict[str, float]) -> Tuple[float, List[str]]:
	        """Return the raw score of ``lang`` for ``code`` plus evidence labels."""
	        score = 0.0
	        evidence: List[str] = []

	        # 1. Regex matches: each match contributes the pattern weight.
	        for pattern, weight in feature_set.patterns:
	            hits = len(pattern.findall(code))
	            if hits:
	                score += hits * weight
	                evidence.append(f"pattern:{pattern.pattern[:20]}({hits})")

	        # 2. Keywords: plain, non-overlapping substring counts.
	        for keyword, weight in feature_set.keywords.items():
	            count = code.count(keyword)
	            if count > 0:
	                score += count * weight
	                evidence.append(f"keyword:{keyword}({count})")

	        # 3. Structural ratios: ignored at or below the 0.01 threshold.
	        for feature_name, weight in self.structural_features.get(lang, []):
	            feature_value = structural_features.get(feature_name, 0)
	            if feature_value > 0.01:
	                structural_score = feature_value * weight * 100
	                score += structural_score
	                evidence.append(f"structure:{feature_name}({structural_score:.1f})")

	        return score, evidence

	    def _evolution_multiplier(self, lang: str) -> float:
	        """Return the capped usage-based score multiplier for ``lang``."""
	        boost = 1.0 + self.usage_stats.get(lang, 0) * 0.1
	        return min(boost, self.MAX_EVOLUTION_MULTIPLIER)

	    def _record_usage(self, lang: str) -> None:
	        """Increment the usage counter of ``lang`` and persist the counters."""
	        self.usage_stats[lang] += 1
	        self.save_evolution_data()

	    def detect(self, code: str, use_evolution: bool = True) -> Dict[str, Any]:
	        """Detect the language of ``code``.

	        Args:
	            code: Source text to classify.
	            use_evolution: When true, multiply each language score by a
	                factor derived from how often that language was detected
	                before (capped at ``MAX_EVOLUTION_MULTIPLIER``).

	        Returns:
	            A dict that always contains ``language``, ``confidence``,
	            ``processing_time_ms`` and ``is_optimized``. Blank input or
	            input without any recognised feature yields ``language`` set to
	            ``'text'`` plus a ``message``. Otherwise the dict also carries
	            ``score``, ``all_scores`` and ``features``; single-language
	            results add ``evolution_boost`` and mixed HTML results add the
	            keys produced by ``_detect_mixed_language``.
	        """
	        # perf_counter is monotonic, unlike the wall clock.
	        started = time.perf_counter()

	        if not code or not code.strip():
	            return self._quick_result('text', 0.0, 'Empty code')

	        code = code.strip()
	        structural_features = self.extract_structural_features(code)

	        scores: Dict[str, float] = {}
	        features_used: Dict[str, List[str]] = {}
	        for lang, feature_set in self.languages.items():
	            score, evidence = self._score_language(lang, feature_set, code, structural_features)
	            if score <= 0:
	                continue
	            if use_evolution:
	                score *= self._evolution_multiplier(lang)
	            scores[lang] = score
	            features_used[lang] = evidence[:3]  # keep only the first three

	        if not scores:
	            return self._quick_result('text', 0.0, 'No language features detected')

	        rounded_scores = {lang: round(value, 2) for lang, value in scores.items()}

	        # HTML with embedded CSS/JavaScript takes precedence over the plain
	        # highest-score result.
	        if len(scores) > 1:
	            mixed_result = self._detect_mixed_language(scores, structural_features)
	            if mixed_result:
	                elapsed_ms = (time.perf_counter() - started) * 1000
	                self._record_usage(mixed_result['language'])
	                return {
	                    'all_scores': rounded_scores,
	                    'features': features_used.get(mixed_result['language'], []),
	                    **mixed_result,
	                    'processing_time_ms': round(elapsed_ms, 2),
	                    'is_optimized': True,
	                }

	        best_lang = max(scores.items(), key=lambda item: item[1])[0]
	        best_score = scores[best_lang]
	        total_score = sum(scores.values())
	        confidence = best_score / total_score if total_score > 0 else 0.0

	        elapsed_ms = (time.perf_counter() - started) * 1000
	        self._record_usage(best_lang)

	        return {
	            'language': best_lang,
	            'confidence': round(min(confidence, 0.99), 3),
	            'score': round(best_score, 2),
	            'all_scores': rounded_scores,
	            'features': features_used.get(best_lang, []),
	            'processing_time_ms': round(elapsed_ms, 2),
	            'is_optimized': True,
	            'evolution_boost': self.usage_stats[best_lang],
	        }

	    def _detect_mixed_language(self, scores: Dict[str, float],
	                               structural: Dict[str, float]) -> Optional[Dict[str, Any]]:
	        """Recognise an HTML document with embedded CSS and/or JavaScript.

	        Args:
	            scores: Per-language scores computed by ``detect``.
	            structural: Structural features of the input (currently unused).

	        Returns:
	            A result dict when the HTML score exceeds 10 and at least one
	            of CSS / JavaScript exceeds 5; otherwise ``None``. ``mixed_with``
	            and ``embedded_languages`` list only the languages that
	            actually exceeded the threshold.
	        """
	        html_score = scores.get('html', 0)
	        if html_score <= 10:
	            return None

	        embedded = [lang for lang in ('css', 'javascript') if scores.get(lang, 0) > 5]
	        if not embedded:
	            return None

	        return {
	            'language': 'html',
	            'confidence': 0.85,
	            'is_mixed': True,
	            'mixed_with': list(embedded),
	            'primary_language': 'html',
	            'embedded_languages': list(embedded),
	            'score': html_score + max(scores[lang] for lang in embedded),
	        }

	    def _quick_result(self, lang: str, confidence: float, message: str) -> Dict[str, Any]:
	        """Build the short result used when no scoring took place.

	        ``processing_time_ms`` is a fixed placeholder (0.1), not a measurement.
	        """
	        return {
	            'language': lang,
	            'confidence': confidence,
	            'message': message,
	            'processing_time_ms': 0.1,
	            'is_optimized': True
	        }

	    def load_evolution_data(self):
	        """Load persisted usage counters; best effort, never raises on bad files.

	        A missing file leaves the counters untouched. An unreadable or
	        malformed file resets them to empty. Only non-negative integer
	        counters are accepted so later arithmetic on them cannot fail.
	        """
	        try:
	            with open(self.EVOLUTION_FILE, 'r', encoding='utf-8') as f:
	                data = json.load(f)
	        except FileNotFoundError:
	            return
	        except (OSError, ValueError):
	            # ValueError covers both invalid JSON and undecodable bytes.
	            self.usage_stats = defaultdict(int)
	            return

	        stats = data.get('usage_stats', {}) if isinstance(data, dict) else {}
	        if not isinstance(stats, dict):
	            return
	        for lang, count in stats.items():
	            # bool is a subclass of int but is not a meaningful counter.
	            if isinstance(count, int) and not isinstance(count, bool) and count >= 0:
	                self.usage_stats[lang] = count

	    def save_evolution_data(self):
	        """Persist the usage counters; best effort, failures are ignored."""
	        try:
	            data = {
	                'usage_stats': dict(self.usage_stats),
	                'total_detections': sum(self.usage_stats.values()),
	                'last_updated': time.time()
	            }
	            with open(self.EVOLUTION_FILE, 'w', encoding='utf-8') as f:
	                json.dump(data, f, ensure_ascii=False, indent=2)
	        except (OSError, TypeError, ValueError):
	            # Deliberately silent: detection must keep working on read-only
	            # file systems or when the counters are not serialisable.
	            pass

	# Global shared detector instance. Constructing it runs __init__, which
	# compiles the pattern tables and attempts to load evolution_data.json.
	ultra_detector = UltraFastLanguageDetector()

	def detect_language_ultra_fast(code: str) -> Dict[str, Any]:
	    """Detect the language of ``code`` with the shared module-level detector.

	    Delegates to ``ultra_detector.detect`` with its default arguments and
	    returns that result dict unchanged.
	    """
	    result = ultra_detector.detect(code)
	    return result