""" feature_extraction.py ===================== Višejezično izvlačenje značajki iz programskog koda. Podržani jezici: Python, JavaScript, TypeScript, Java, C, C++, Go, Rust, Ruby Organizirano po načinima detekcije: 1. Stilska detekcija – komentari, imenovanje, formatiranje 2. Strukturna detekcija – AST analiza, složenost, tok kontrole 3. Statistička detekcija – perplexity pomoću jezičnog modela Glavna funkcija: extract_all_features(code, language=None, filename=None) -> dict """ import re import math from collections import Counter from tree_sitter import Language, Parser from language_config import ( get_config, detect_language_from_code, detect_language_from_extension, ) # ───────────────────────────────────────────────────────────────────────────── # POMOĆNE FUNKCIJE # ───────────────────────────────────────────────────────────────────────────── def _safe_divide(a: float, b: float, default: float = 0.0) -> float: """Dijeljenje koje ne puca na nuli.""" return a / b if b != 0 else default def _get_lines(code: str) -> list: """Vraća sve linije koda kao listu stringova.""" return code.splitlines() def _build_tree(code: str, ts_module): """ Parsira kod pomoću Tree-sitter i vraća stablo. Koristi modul specifičan za jezik (npr. tree_sitter_python). Vraća None ako parsiranje ne uspije. """ try: lang = Language(ts_module.language()) parser = Parser(lang) return parser.parse(code.encode("utf-8", errors="replace")) except Exception: return None def _walk_tree(node) -> list: """ Prolazi kroz cijelo Tree-sitter stablo i vraća listu svih čvorova. Ekvivalent ast.walk() iz Pythonovog standardnog modula. """ result = [node] for child in node.children: result.extend(_walk_tree(child)) return result def _count_node_types(all_nodes: list, type_names: list) -> int: """Broji koliko se puta pojavljuje bilo koji od zadanih tipova čvorova.""" if not type_names: return 0 return sum(1 for n in all_nodes if n.type in type_names) def _get_node_depth(node, current: int = 0) -> int: """Rekurzivno računa maksimalnu dubinu stabla.""" if not node.children: return current return max(_get_node_depth(child, current + 1) for child in node.children) # ───────────────────────────────────────────────────────────────────────────── # NAČIN 1: STILSKA DETEKCIJA # ───────────────────────────────────────────────────────────────────────────── def extract_style_features(code: str, lang_config: dict) -> dict: """ Izvlači stilske značajke iz koda. Radi za sve jezike jer: - komentare prepoznaje regex obrascem iz lang_config (svaki jezik ima drugu sintaksu komentara) - identifikatore uzima iz Tree-sitter stabla (tree-sitter radi za sve podržane jezike) - formatiranje gleda direktno po linijama (potpuno universalno — vrijedi za sve jezike) Parametri: code (str): Izvorni kod kao string. lang_config (dict): Konfiguracija jezika iz language_config.py. Vraća: dict: Stilske značajke s float/int vrijednostima. """ lines = _get_lines(code) total_lines = len(lines) if lines else 1 # ── KOMENTARI ────────────────────────────────────────────────────────── # Svaki jezik ima drugačiji simbol — Python koristi #, Java/JS koriste // # Regex obrazac je definiran u language_config.py za svaki jezik. inline_pat = lang_config["inline_comment"] comment_lines = [l for l in lines if re.match(inline_pat, l)] num_comment_lines = len(comment_lines) # Prosječna duljina komentara u riječima comment_words_total = sum( len(re.sub(inline_pat, "", l).strip().split()) for l in comment_lines ) avg_comment_length_words = _safe_divide(comment_words_total, num_comment_lines) # Blok komentari: /* ... */ u Java/JS/C, =begin...=end u Ruby num_block_comments = 0 if lang_config["block_comment"]: start, end = lang_config["block_comment"] num_block_comments = len(re.findall( re.escape(start) + r"[\s\S]*?" + re.escape(end), code )) # Ukupan udio komentara u znakovima total_comment_chars = sum(len(l) for l in comment_lines) comment_to_code_ratio = _safe_divide(total_comment_chars, max(len(code), 1)) # Dokumentacijski komentari (docstring, JSDoc, Javadoc...) num_docstrings = 0 if lang_config.get("docstring_pattern"): num_docstrings = len(re.findall(lang_config["docstring_pattern"], code)) # ── IMENOVANJE (iz Tree-sitter stabla) ──────────────────────────────── # Tree-sitter za svaki jezik daje čvorove tipa "identifier" # koji sadrže nazive varijabli, funkcija, argumenata itd. identifier_names = [] function_names = [] tree = _build_tree(code, lang_config["ts_module"]) if tree: all_nodes = _walk_tree(tree.root_node) id_types = lang_config["node_types"].get("identifier", ["identifier"]) fn_types = lang_config["node_types"].get("function", []) # Skupljamo sve identifikatore for node in all_nodes: if node.type in id_types and node.text: name = node.text.decode("utf-8", errors="replace") if len(name) >= 1: identifier_names.append(name) # Skupljamo nazive funkcija — tražimo "identifier" dijete # unutar čvora koji označava funkciju for node in all_nodes: if node.type in fn_types: for child in node.children: if child.type in id_types and child.text: fn_name = child.text.decode("utf-8", errors="replace") function_names.append(fn_name) break avg_identifier_length = _safe_divide( sum(len(n) for n in identifier_names), len(identifier_names) ) avg_function_name_length = _safe_divide( sum(len(n) for n in function_names), len(function_names) ) # Jednoslovna imena (i, x, n, k...) — čovječji kod ih ima više single_char_count = sum(1 for n in identifier_names if len(n) == 1) single_char_ratio = _safe_divide(single_char_count, len(identifier_names)) # Leksička raznolikost: visoka = raznovrsni nazivi (čovjek), niska = AI ponavlja obrasce lexical_diversity = _safe_divide( len(set(identifier_names)), len(identifier_names) ) # Konvencije imenovanja def is_snake_case(n): return "_" in n and n == n.lower() and not n.startswith("_") def is_camel_case(n): return (len(n) > 1 and not n.startswith("_") and n[0].islower() and any(c.isupper() for c in n) and "_" not in n) def is_pascal_case(n): return (len(n) > 1 and n[0].isupper() and any(c.islower() for c in n) and "_" not in n) total_ids = max(len(identifier_names), 1) snake_count = sum(1 for n in identifier_names if is_snake_case(n)) camel_count = sum(1 for n in identifier_names if is_camel_case(n)) pascal_count = sum(1 for n in identifier_names if is_pascal_case(n)) snake_ratio = _safe_divide(snake_count, total_ids) camel_ratio = _safe_divide(camel_count, total_ids) # Konzistentnost imenovanja: 1.0 = svi identifikatori u istom stilu (tipično AI) naming_consistency = _safe_divide( max(snake_count, camel_count, pascal_count), total_ids ) # ── FORMATIRANJE (potpuno universalno za sve jezike) ─────────────────── non_empty_lines = [l for l in lines if l.strip()] empty_line_ratio = _safe_divide(total_lines - len(non_empty_lines), total_lines) line_lengths = [len(l) for l in non_empty_lines] if non_empty_lines else [0] avg_line_length = _safe_divide(sum(line_lengths), len(line_lengths)) max_line_length = max(line_lengths) if line_lengths else 0 # Tabovi vs razmaci za uvlačenje tab_lines = sum(1 for l in lines if l.startswith("\t")) uses_tabs = int(tab_lines > len(lines) * 0.1) # Trailing whitespace — razmaci na kraju linije trailing_ws = sum(1 for l in lines if l != l.rstrip()) trailing_ws_ratio = _safe_divide(trailing_ws, total_lines) # Konzistentnost razmaka oko operatora (= == != < >...) with_space = len(re.findall(r"\s[=!<>]=?\s", code)) without_space = len(re.findall(r"[^\s=!<>][=!<>]=[^\s=]", code)) operator_consistency = _safe_divide( max(with_space, without_space), with_space + without_space ) return { # Komentari "num_comment_lines": num_comment_lines, "comment_ratio": _safe_divide(num_comment_lines, total_lines), "avg_comment_length_words": avg_comment_length_words, "comment_to_code_ratio": comment_to_code_ratio, "num_block_comments": num_block_comments, "num_docstrings": num_docstrings, # Imenovanje "avg_identifier_length": avg_identifier_length, "avg_function_name_length": avg_function_name_length, "single_char_name_ratio": single_char_ratio, "lexical_diversity": lexical_diversity, "snake_case_ratio": snake_ratio, "camel_case_ratio": camel_ratio, "naming_consistency": naming_consistency, # Formatiranje "total_lines": total_lines, "empty_line_ratio": empty_line_ratio, "avg_line_length": avg_line_length, "max_line_length": max_line_length, "uses_tabs": uses_tabs, "trailing_whitespace_ratio": trailing_ws_ratio, "operator_spacing_consistency": operator_consistency, } # ───────────────────────────────────────────────────────────────────────────── # NAČIN 2: STRUKTURNA DETEKCIJA # ───────────────────────────────────────────────────────────────────────────── def extract_structural_features(code: str, lang_config: dict) -> dict: """ Izvlači strukturne značajke iz koda pomoću Tree-sitter AST analize. Tree-sitter radi za sve podržane jezike — jedina razlika su nazivi čvorova (npr. "function_definition" u Pythonu vs "method_declaration" u Javi), a to je riješeno kroz lang_config["node_types"]. Parametri: code (str): Izvorni kod kao string. lang_config (dict): Konfiguracija jezika iz language_config.py. Vraća: dict: Strukturne značajke, ili rječnik nula ako parsiranje ne uspije. """ empty_result = {k: 0 for k in [ "ast_depth", "ast_node_count", "unique_node_type_ratio", "num_functions", "avg_function_length", "max_function_length", "avg_args_per_function", "num_classes", "num_imports", "num_if_statements", "num_for_loops", "num_while_loops", "num_try_blocks", "num_lambdas", "max_nesting_depth", "avg_nesting_depth", "cyclomatic_complexity_approx", ]} tree = _build_tree(code, lang_config["ts_module"]) if tree is None: return empty_result root = tree.root_node all_nodes = _walk_tree(root) nt = lang_config["node_types"] # ── AST STABLO ───────────────────────────────────────────────────────── ast_depth = _get_node_depth(root) ast_node_count = len(all_nodes) node_type_counts = Counter(n.type for n in all_nodes) unique_node_type_ratio = _safe_divide(len(node_type_counts), ast_node_count) # ── FUNKCIJE ─────────────────────────────────────────────────────────── function_nodes = [n for n in all_nodes if n.type in nt.get("function", [])] num_functions = len(function_nodes) # Duljina svake funkcije u linijama koda function_lengths = [ fn.end_point[0] - fn.start_point[0] + 1 for fn in function_nodes ] avg_function_length = _safe_divide(sum(function_lengths), len(function_lengths)) max_function_length = max(function_lengths) if function_lengths else 0 # Broj parametara po funkciji args_counts = [] for fn in function_nodes: for child in fn.children: if child.type in ("parameters", "formal_parameters", "parameter_list", "argument_list"): params = [c for c in child.children if c.type not in ("(", ")", ",", "self")] args_counts.append(len(params)) break avg_args_per_function = _safe_divide(sum(args_counts), len(args_counts)) # ── KLASE I IMPORTI ──────────────────────────────────────────────────── num_classes = _count_node_types(all_nodes, nt.get("class", [])) num_imports = _count_node_types(all_nodes, nt.get("import", [])) # ── TOK KONTROLE ─────────────────────────────────────────────────────── num_if = _count_node_types(all_nodes, nt.get("if", [])) num_for = _count_node_types(all_nodes, nt.get("for", [])) num_while = _count_node_types(all_nodes, nt.get("while", [])) num_try = _count_node_types(all_nodes, nt.get("try", [])) num_lambdas = _count_node_types(all_nodes, nt.get("lambda", [])) # ── DUBINA UGNIJEŽĐENOSTI ────────────────────────────────────────────── nesting_types = set( nt.get("if", []) + nt.get("for", []) + nt.get("while", []) + nt.get("function", []) ) def collect_depths(node, depth=0): depths = [] for child in node.children: if child.type in nesting_types: depths.append(depth + 1) depths.extend(collect_depths(child, depth + 1)) else: depths.extend(collect_depths(child, depth)) return depths nesting_depths = collect_depths(root) max_nesting_depth = max(nesting_depths) if nesting_depths else 0 avg_nesting_depth = _safe_divide(sum(nesting_depths), len(nesting_depths)) # ── APROKSIMACIJA CIKLOMATSKE SLOŽENOSTI ────────────────────────────── # CC ≈ 1 + broj grananja, dijeljeno brojem funkcija # Standardna aproksimacija koja radi za sve jezike. total_branches = 1 + num_if + num_for + num_while + num_try cyclomatic_approx = ( _safe_divide(total_branches, num_functions) if num_functions > 0 else float(total_branches) ) # ── NORMALIZACIJA PO VELIČINI KODA ──────────────────────────────────── # Apsolutni brojevi (num_if, num_for...) ovise o veličini koda. # Dijeljenjem s brojem nepraznih linija dobivamo gustoću koja je # usporediva između kratkih i dugih kodova. # Npr. 3 if-a u 10 linija (0.30) vs 3 if-a u 100 linija (0.03) # — apsolutni broj je isti, ali gustoća govori pravu priču. lines_all = _get_lines(code) non_empty = max(sum(1 for l in lines_all if l.strip()), 1) if_density = _safe_divide(num_if, non_empty) for_density = _safe_divide(num_for, non_empty) while_density = _safe_divide(num_while, non_empty) try_density = _safe_divide(num_try, non_empty) function_density = _safe_divide(num_functions, non_empty) class_density = _safe_divide(num_classes, non_empty) import_density = _safe_divide(num_imports, non_empty) lambda_density = _safe_divide(num_lambdas, non_empty) # Broj AST čvorova po liniji — AI kod ima predvidljive strukture ast_nodes_per_line = _safe_divide(ast_node_count, non_empty) # ── ENTROPIJA KODA ──────────────────────────────────────────────────── # Entropija mjeri raznolikost i nepredvidljivost koda. # Visoka entropija = raznolik, nepredvidljiv kod = vjerojatno čovjek # Niska entropija = ponavljajući, predvidljiv kod = možda AI # # Formula: H = -sum(p * log2(p)) za svaki jedinstveni element import math as _math from collections import Counter as _Counter # Entropija znakova — distribucija pojedinih znakova u kodu char_counts = _Counter(code) total_chars = len(code) if code else 1 char_entropy = -sum( (c / total_chars) * _math.log2(c / total_chars) for c in char_counts.values() ) # Entropija tokena — raznolikost na razini imenskih jedinica i simbola # Bolji signal od entropije znakova jer gleda smislene jezične jedinice tokens = re.findall(r"[a-zA-Z_]\w*|[0-9]+|[^\w\s]", code) token_counts = _Counter(tokens) total_tokens = len(tokens) if tokens else 1 token_entropy = -sum( (c / total_tokens) * _math.log2(c / total_tokens) for c in token_counts.values() ) return { # AST metrike (neovisne o veličini) "ast_depth": ast_depth, "unique_node_type_ratio": unique_node_type_ratio, "ast_nodes_per_line": ast_nodes_per_line, # Funkcije (prosjeci su već neovisni o veličini) "avg_function_length": avg_function_length, "max_function_length": max_function_length, "avg_args_per_function": avg_args_per_function, # Gustoće — normalizirane po nepraznim linijama koda "function_density": function_density, "class_density": class_density, "import_density": import_density, "if_density": if_density, "for_density": for_density, "while_density": while_density, "try_density": try_density, "lambda_density": lambda_density, # Ugniježđenost i složenost (već neovisni o veličini) "max_nesting_depth": max_nesting_depth, "avg_nesting_depth": avg_nesting_depth, "cyclomatic_complexity_approx": cyclomatic_approx, # Entropija "char_entropy": char_entropy, "token_entropy": token_entropy, } # ───────────────────────────────────────────────────────────────────────────── # NAČIN 3: STATISTIČKA DETEKCIJA (Perplexity) # ───────────────────────────────────────────────────────────────────────────── def extract_statistical_features(code: str, model=None, tokenizer=None) -> dict: """ Računa perplexity koda pomoću jezičnog modela. Niski perplexity → model je "očekivao" kod → vjerojatno AI. Visoki perplexity → model je "iznenađen" → vjerojatno čovjek. Metoda je jezično-agnostična — isti model prima kod u bilo kojem jeziku. Ako model nije proslijeđen (None), vraća -1 i ostatak pipeline-a nastavlja normalno bez statističke značajke. """ if model is None or tokenizer is None: return {"perplexity": -1.0, "model_available": 0} try: import torch inputs = tokenizer( code, return_tensors="pt", truncation=True, max_length=512 ) with torch.no_grad(): outputs = model(inputs["input_ids"], labels=inputs["input_ids"]) loss = outputs.loss perplexity = math.exp(loss.item()) except Exception as e: print(f" [UPOZORENJE] Perplexity nije izračunat: {e}") perplexity = -1.0 return {"perplexity": perplexity, "model_available": 1} # ───────────────────────────────────────────────────────────────────────────── # KOMBINIRANA FUNKCIJA — ulazna točka # ───────────────────────────────────────────────────────────────────────────── def extract_all_features( code: str, language=None, filename=None, model=None, tokenizer=None, ) -> dict: """ Izvlači SVE značajke iz koda u jednom pozivu. Jezik se određuje ovim redoslijedom: 1. Argument language (ako je zadan) 2. Nastavak datoteke iz filename (ako je zadan) 3. Heuristike iz samog koda (automatska detekcija) Parametri: code (str): Izvorni kod kao string. language (str|None): Naziv jezika (npr. "python", "java"). filename (str|None): Ime datoteke (npr. "main.py"). model: (opcionalno) HuggingFace model za perplexity. tokenizer: (opcionalno) HuggingFace tokenizator. Vraća: dict: Sve značajke + ključ "detected_language". """ # Određivanje jezika if language is not None: detected_lang = language.lower() elif filename is not None: detected_lang = (detect_language_from_extension(filename) or detect_language_from_code(code)) else: detected_lang = detect_language_from_code(code) lang_config = get_config(detected_lang) # Izvlačenje značajki po metodama style_feats = extract_style_features(code, lang_config) structural_feats = extract_structural_features(code, lang_config) statistical_feats = extract_statistical_features(code, model, tokenizer) return { "detected_language": detected_lang, **style_feats, **structural_feats, **statistical_feats, } # ───────────────────────────────────────────────────────────────────────────── # BRZI TEST — pokreni: python feature_extraction.py # ───────────────────────────────────────────────────────────────────────────── if __name__ == "__main__": test_cases = { "Python (AI)": ("python", ''' def calculate_average(numbers: list) -> float: """Calculate the arithmetic mean of a list of numbers.""" if not numbers: raise ValueError("Cannot calculate average of an empty list.") total_sum = sum(numbers) count = len(numbers) return total_sum / count '''), "Python (Human)": ("python", ''' def avg(nums): # quick avg return sum(nums) / len(nums) '''), "JavaScript (AI)": ("javascript", ''' /** * Calculates the average of an array of numbers. */ function calculateAverage(numbers) { if (!numbers || numbers.length === 0) { throw new Error("Cannot calculate average of an empty array."); } const totalSum = numbers.reduce((acc, val) => acc + val, 0); return totalSum / numbers.length; } '''), "Java (AI)": ("java", ''' /** * Calculates the average of an integer array. */ public class Calculator { public static double calculateAverage(int[] numbers) { if (numbers == null || numbers.length == 0) { throw new IllegalArgumentException("Array must not be empty."); } int totalSum = 0; for (int currentNumber : numbers) { totalSum += currentNumber; } return (double) totalSum / numbers.length; } } '''), } KEY_FEATURES = [ ("Prepoznat jezik", "detected_language"), ("Omjer komentara", "comment_ratio"), ("Broj docstringova", "num_docstrings"), ("Prosj. duljina identif.", "avg_identifier_length"), ("Prosj. duljina fun. naziva", "avg_function_name_length"), ("Jednoslovna imena", "single_char_name_ratio"), ("Konzistentnost imenovanja", "naming_consistency"), ("Broj funkcija", "num_functions"), ("Prosj. duljina funkcije", "avg_function_length"), ("Max ugniježđenost", "max_nesting_depth"), ("Aproks. složenost (CC)", "cyclomatic_complexity_approx"), ("Perplexity", "perplexity"), ] for label, (lang, code) in test_cases.items(): print(f"\n{'═' * 55}") print(f" {label}") print(f"{'═' * 55}") features = extract_all_features(code, language=lang) for display_name, key in KEY_FEATURES: val = features.get(key, "N/A") if isinstance(val, float): print(f" {display_name:<32} {val:.4f}") else: print(f" {display_name:<32} {val}") print(f"\n Ukupno značajki: {len(features)}") # ───────────────────────────────────────────────────────────────────────────── # ANALIZA PO LINIJAMA — za prikaz sumnjivih linija u UI-u # ───────────────────────────────────────────────────────────────────────────── def analyze_lines(code: str, language: str = None, filename: str = None) -> list: """ Analizira kod liniju po liniju i vraća listu sumnjivih linija. Svaki element liste je rječnik: { "line": int, # broj linije (1-based) "tone": str, # "red" = jak signal, "amber" = umjeren "note": str, # kratko objašnjenje (prikazuje se u UI-u) } Detektira sljedeće AI signale po liniji: - Docstringovi i formalni blok komentari (jak signal) - Jednolinijski komentari (umjeren signal) - Linije s dugačkim identifikatorima (umjeren signal) - Type anotacije (umjeren signal) - Try/except/raise/throw s formalnim porukama (umjeren signal) - Linije s višestrukim opisnim identifikatorima (jak signal) Parametri: code (str): Izvorni kod kao string. language (str|None): Naziv jezika — ako None, automatski se detektira. filename (str|None): Ime datoteke — pomaže detekciji jezika. Vraća: list: Lista rječnika s anotacijama, sortirana po broju linije. """ import re if language is None: if filename is not None: language = detect_language_from_extension(filename) or detect_language_from_code(code) else: language = detect_language_from_code(code) try: lang_config = get_config(language) except ValueError: lang_config = get_config("python") lines = code.splitlines() annotations = [] in_docstring = False docstring_char = None inline_pat = lang_config.get("inline_comment", r"^\s*#") for i, line in enumerate(lines, start=1): stripped = line.strip() if not stripped: continue # ── DOCSTRINGOVI / BLOK KOMENTARI ───────────────────────────────── # Python docstringovi (""" ili ''') if language == "python": triple_count = stripped.count('"""') + stripped.count("'''") if not in_docstring and ('"""' in stripped or "'''" in stripped): docstring_char = '"""' if '"""' in stripped else "'''" # Ako se otvara i zatvara na istoj liniji → jednolinijski docstring if stripped.count(docstring_char) >= 2 and len(stripped) > 6: annotations.append({ "line": i, "tone": "red", "note": "Formal docstring — strong AI indicator" }) else: in_docstring = True annotations.append({ "line": i, "tone": "red", "note": "Docstring block — strong AI indicator" }) continue elif in_docstring: annotations.append({ "line": i, "tone": "red", "note": "Docstring content" }) if docstring_char and docstring_char in stripped: in_docstring = False continue # Javadoc / JSDoc blokovi (/** ... */) if language in ("java", "javascript", "typescript", "cpp", "c"): if stripped.startswith("/**") or stripped.startswith("* ") or stripped == "*/": annotations.append({ "line": i, "tone": "red", "note": "Formal documentation comment — strong AI indicator" }) continue # ── JEDNOLINIJSKI KOMENTARI ──────────────────────────────────────── if re.match(inline_pat, line): # Gledamo duljinu komentara — kratki (#) ne flagiramo comment_text = re.sub(inline_pat, "", line).strip() word_count = len(comment_text.split()) if word_count >= 4: annotations.append({ "line": i, "tone": "amber", "note": f"Inline comment ({word_count} words) — elevated comment density" }) continue # ── DUGAČKI IDENTIFIKATORI ───────────────────────────────────────── # VAŽNO: Prije analize, uklonimo sadržaj string literala s linije. # Bez ovoga, regex bi uhvatio i prirodne riječi unutar stringova # (npr. "Ucitajte red matrice") kao identifikatore — što je pogrešno. code_only = re.sub(r'"[^"]*"', '""', stripped) # ukloni "..." code_only = re.sub(r"'[^']*'", "''", code_only) # ukloni '...' code_only = re.sub(r"`[^`]*`", "``", code_only) # ukloni `...` identifiers = re.findall(r'\b([a-zA-Z_][a-zA-Z0-9_]{4,})\b', code_only) # Preskačemo rezervirane riječi i uobičajene kratke stdlib nazive reserved = { "return", "import", "function", "class", "interface", "public", "private", "static", "const", "false", "true", "none", "self", "print", "printf", "scanf", "range", "raise", "while", "break", "continue", "yield", "lambda", "assert", "except", "finally", "include", "define", "string", "vector", "struct", "unsigned", "length", "value", "write", "reads", "fopen", "fclose", "malloc", "sizeof", "stdio", "stdlib", "nullptr", "virtual", "override", "inline", "extern", "register", "volatile", "switch", "default", } real_ids = [x for x in identifiers if x.lower() not in reserved] if len(real_ids) >= 2: avg_len = sum(len(x) for x in real_ids) / len(real_ids) if avg_len >= 9: annotations.append({ "line": i, "tone": "red", "note": f"Very long identifiers (avg {avg_len:.0f} chars) — AI naming pattern" }) continue elif avg_len >= 7: annotations.append({ "line": i, "tone": "amber", "note": f"Descriptive identifier names (avg {avg_len:.0f} chars)" }) continue # ── TYPE ANOTACIJE (Python) ──────────────────────────────────────── if language == "python": if re.search(r'\)\s*->\s*\w', stripped) or re.search(r':\s*(int|float|str|bool|list|dict|tuple|set|Optional|Union|List|Dict)\b', stripped): annotations.append({ "line": i, "tone": "amber", "note": "Type annotation — uncommon in student code" }) continue # ── TRY/EXCEPT/RAISE S PORUKAMA ──────────────────────────────────── if re.match(r'^\s*(raise|throw)\s+\w*Error\s*\(', line) or \ re.match(r'^\s*(raise|throw)\s+\w*Exception\s*\(', line): annotations.append({ "line": i, "tone": "amber", "note": "Explicit exception with message — AI error handling pattern" }) continue if re.match(r'^\s*(except|catch)\s*[\(\w]', line): annotations.append({ "line": i, "tone": "amber", "note": "Exception handling — AI code often handles all edge cases" }) continue # Makni previše sumnjivih linija — ako je >60% flagirano, to gubi smisao # Prikaži samo najsumnjivije linije (max 40% koda) total_nonblank = sum(1 for l in lines if l.strip()) max_annotations = max(3, int(total_nonblank * 0.40)) # Sortiraj: red prije amber, onda po broju linije priority = {"red": 0, "amber": 1} annotations.sort(key=lambda a: (priority.get(a["tone"], 2), a["line"])) annotations = annotations[:max_annotations] annotations.sort(key=lambda a: a["line"]) return annotations