Spaces:
Sleeping
Sleeping
| """ | |
feature_extraction.py
=====================
Višejezično izvlačenje značajki iz programskog koda.
Podržani jezici: Python, JavaScript, TypeScript, Java, C, C++, Go, Rust, Ruby

Organizirano po načinima detekcije:
    1. Stilska detekcija — komentari, imenovanje, formatiranje
    2. Strukturna detekcija — AST analiza, složenost, tok kontrole
    3. Statistička detekcija — perplexity pomoću jezičnog modela

Glavna funkcija:
    extract_all_features(code, language=None, filename=None) -> dict
| """ | |
| import re | |
| import math | |
| from collections import Counter | |
| from tree_sitter import Language, Parser | |
| from language_config import ( | |
| get_config, | |
| detect_language_from_code, | |
| detect_language_from_extension, | |
| ) | |
# ─────────────────────────────────────────────────────────────────────────────
# POMOĆNE FUNKCIJE
# ─────────────────────────────────────────────────────────────────────────────
| def _safe_divide(a: float, b: float, default: float = 0.0) -> float: | |
| """Dijeljenje koje ne puca na nuli.""" | |
| return a / b if b != 0 else default | |
| def _get_lines(code: str) -> list: | |
| """VraΔa sve linije koda kao listu stringova.""" | |
| return code.splitlines() | |
| def _build_tree(code: str, ts_module): | |
| """ | |
| Parsira kod pomoΔu Tree-sitter i vraΔa stablo. | |
| Koristi modul specifiΔan za jezik (npr. tree_sitter_python). | |
| VraΔa None ako parsiranje ne uspije. | |
| """ | |
| try: | |
| lang = Language(ts_module.language()) | |
| parser = Parser(lang) | |
| return parser.parse(code.encode("utf-8", errors="replace")) | |
| except Exception: | |
| return None | |
| def _walk_tree(node) -> list: | |
| """ | |
| Prolazi kroz cijelo Tree-sitter stablo i vraΔa listu svih Δvorova. | |
| Ekvivalent ast.walk() iz Pythonovog standardnog modula. | |
| """ | |
| result = [node] | |
| for child in node.children: | |
| result.extend(_walk_tree(child)) | |
| return result | |
| def _count_node_types(all_nodes: list, type_names: list) -> int: | |
| """Broji koliko se puta pojavljuje bilo koji od zadanih tipova Δvorova.""" | |
| if not type_names: | |
| return 0 | |
| return sum(1 for n in all_nodes if n.type in type_names) | |
| def _get_node_depth(node, current: int = 0) -> int: | |
| """Rekurzivno raΔuna maksimalnu dubinu stabla.""" | |
| if not node.children: | |
| return current | |
| return max(_get_node_depth(child, current + 1) for child in node.children) | |
# ─────────────────────────────────────────────────────────────────────────────
# NAČIN 1: STILSKA DETEKCIJA
# ─────────────────────────────────────────────────────────────────────────────
def extract_style_features(code: str, lang_config: dict) -> dict:
    """
    Compute style-oriented features of a piece of source code.

    Three groups of features are produced:
      - comments: found with the regex patterns stored in ``lang_config``
      - naming: identifier texts taken from the Tree-sitter syntax tree
      - formatting: measured directly on the raw text lines

    Parameters:
        code (str): Source code as a string.
        lang_config (dict): Language configuration. The keys read here are
            ``inline_comment``, ``block_comment``, ``docstring_pattern``
            (optional), ``ts_module`` and ``node_types``.

    Returns:
        dict: Feature name -> int/float value.
    """

    # Naming-convention predicates.
    def _looks_snake(name):
        return "_" in name and name == name.lower() and not name.startswith("_")

    def _looks_camel(name):
        if len(name) <= 1 or name.startswith("_") or "_" in name:
            return False
        return name[0].islower() and any(ch.isupper() for ch in name)

    def _looks_pascal(name):
        if len(name) <= 1 or "_" in name:
            return False
        return name[0].isupper() and any(ch.islower() for ch in name)

    source_lines = _get_lines(code)
    # Empty input is treated as one line so the ratios below stay defined.
    total_lines = len(source_lines) or 1

    # Comments: a line counts when the language's inline pattern matches at
    # its start.
    comment_regex = lang_config["inline_comment"]
    num_comment_lines = 0
    comment_word_total = 0
    comment_char_total = 0
    for text in source_lines:
        if not re.match(comment_regex, text):
            continue
        num_comment_lines += 1
        # Words remaining after the comment marker has been removed.
        comment_word_total += len(re.sub(comment_regex, "", text).strip().split())
        comment_char_total += len(text)
    avg_comment_length_words = _safe_divide(comment_word_total, num_comment_lines)

    # Block comments, delimited by the (start, end) pair from the config.
    block_delims = lang_config["block_comment"]
    num_block_comments = 0
    if block_delims:
        opener, closer = block_delims
        block_regex = re.escape(opener) + r"[\s\S]*?" + re.escape(closer)
        num_block_comments = len(re.findall(block_regex, code))

    # Share of characters that sit on inline-comment lines.
    comment_to_code_ratio = _safe_divide(comment_char_total, max(len(code), 1))

    # Documentation comments (pattern supplied by the config, if any).
    doc_regex = lang_config.get("docstring_pattern")
    num_docstrings = len(re.findall(doc_regex, code)) if doc_regex else 0

    # Naming: collect identifier texts and function names from the tree.
    identifier_names = []
    function_names = []
    tree = _build_tree(code, lang_config["ts_module"])
    if tree:
        node_types = lang_config["node_types"]
        id_types = node_types.get("identifier", ["identifier"])
        fn_types = node_types.get("function", [])
        for node in _walk_tree(tree.root_node):
            if node.type in id_types and node.text:
                decoded = node.text.decode("utf-8", errors="replace")
                if decoded:
                    identifier_names.append(decoded)
            if node.type in fn_types:
                # A function's name is its first direct identifier child.
                name_node = next(
                    (kid for kid in node.children
                     if kid.type in id_types and kid.text),
                    None,
                )
                if name_node is not None:
                    function_names.append(
                        name_node.text.decode("utf-8", errors="replace")
                    )

    id_count = len(identifier_names)
    avg_identifier_length = _safe_divide(
        sum(map(len, identifier_names)), id_count
    )
    avg_function_name_length = _safe_divide(
        sum(map(len, function_names)), len(function_names)
    )

    # Share of one-character names (i, x, n, ...).
    single_char_total = len([n for n in identifier_names if len(n) == 1])
    single_char_ratio = _safe_divide(single_char_total, id_count)

    # Distinct identifiers relative to all identifier occurrences.
    lexical_diversity = _safe_divide(len(set(identifier_names)), id_count)

    total_ids = max(id_count, 1)
    snake_count = len([n for n in identifier_names if _looks_snake(n)])
    camel_count = len([n for n in identifier_names if _looks_camel(n)])
    pascal_count = len([n for n in identifier_names if _looks_pascal(n)])
    snake_ratio = _safe_divide(snake_count, total_ids)
    camel_ratio = _safe_divide(camel_count, total_ids)
    # Share held by the most common convention; 1.0 means a single style.
    dominant_style = max(snake_count, camel_count, pascal_count)
    naming_consistency = _safe_divide(dominant_style, total_ids)

    # Formatting.
    non_blank = [text for text in source_lines if text.strip()]
    empty_line_ratio = _safe_divide(total_lines - len(non_blank), total_lines)
    line_lengths = [len(text) for text in non_blank] if non_blank else [0]
    avg_line_length = _safe_divide(sum(line_lengths), len(line_lengths))
    max_line_length = max(line_lengths)

    # Tabs are reported as the indent style once more than 10% of lines
    # start with a tab.
    tab_led = len([text for text in source_lines if text.startswith("\t")])
    uses_tabs = int(tab_led > len(source_lines) * 0.1)

    # Lines that end in whitespace.
    trailing_total = len(
        [text for text in source_lines if text != text.rstrip()]
    )
    trailing_ws_ratio = _safe_divide(trailing_total, total_lines)

    # Spacing around assignment/comparison operators: share of the more
    # frequent of the two variants (spaced vs. unspaced).
    spaced_ops = len(re.findall(r"\s[=!<>]=?\s", code))
    tight_ops = len(re.findall(r"[^\s=!<>][=!<>]=[^\s=]", code))
    operator_consistency = _safe_divide(
        max(spaced_ops, tight_ops), spaced_ops + tight_ops
    )

    features = {}
    # Comments
    features["num_comment_lines"] = num_comment_lines
    features["comment_ratio"] = _safe_divide(num_comment_lines, total_lines)
    features["avg_comment_length_words"] = avg_comment_length_words
    features["comment_to_code_ratio"] = comment_to_code_ratio
    features["num_block_comments"] = num_block_comments
    features["num_docstrings"] = num_docstrings
    # Naming
    features["avg_identifier_length"] = avg_identifier_length
    features["avg_function_name_length"] = avg_function_name_length
    features["single_char_name_ratio"] = single_char_ratio
    features["lexical_diversity"] = lexical_diversity
    features["snake_case_ratio"] = snake_ratio
    features["camel_case_ratio"] = camel_ratio
    features["naming_consistency"] = naming_consistency
    # Formatting
    features["total_lines"] = total_lines
    features["empty_line_ratio"] = empty_line_ratio
    features["avg_line_length"] = avg_line_length
    features["max_line_length"] = max_line_length
    features["uses_tabs"] = uses_tabs
    features["trailing_whitespace_ratio"] = trailing_ws_ratio
    features["operator_spacing_consistency"] = operator_consistency
    return features
# ─────────────────────────────────────────────────────────────────────────────
# NAČIN 2: STRUKTURNA DETEKCIJA
# ─────────────────────────────────────────────────────────────────────────────
def extract_structural_features(code: str, lang_config: dict) -> dict:
    """
    Compute structural features of source code from its Tree-sitter tree.

    Node type names differ between languages (e.g. ``function_definition``
    vs. ``method_declaration``); the mapping is read from
    ``lang_config["node_types"]``.

    Parameters:
        code (str): Source code as a string.
        lang_config (dict): Language configuration; ``ts_module`` and
            ``node_types`` are read here.

    Returns:
        dict: Feature name -> numeric value. When the code cannot be parsed,
        the dict has exactly the same keys, all set to 0, so callers always
        see one schema.
    """
    # Names of every feature this function emits, in output order. The
    # failure path is derived from this so both paths share one schema.
    feature_names = (
        "ast_depth", "unique_node_type_ratio", "ast_nodes_per_line",
        "avg_function_length", "max_function_length", "avg_args_per_function",
        "function_density", "class_density", "import_density",
        "if_density", "for_density", "while_density", "try_density",
        "lambda_density",
        "max_nesting_depth", "avg_nesting_depth",
        "cyclomatic_complexity_approx",
        "char_entropy", "token_entropy",
    )

    def _shannon_entropy(counts, total):
        # H = -sum(p * log2(p)) over the observed items.
        return -sum(
            (count / total) * math.log2(count / total)
            for count in counts.values()
        )

    tree = _build_tree(code, lang_config["ts_module"])
    if tree is None:
        return {name: 0 for name in feature_names}

    root = tree.root_node
    all_nodes = _walk_tree(root)
    nt = lang_config["node_types"]

    # Syntax tree size and variety.
    ast_depth = _get_node_depth(root)
    ast_node_count = len(all_nodes)
    node_type_counts = Counter(n.type for n in all_nodes)
    unique_node_type_ratio = _safe_divide(len(node_type_counts), ast_node_count)

    # Functions.
    function_types = nt.get("function", [])
    function_nodes = [n for n in all_nodes if n.type in function_types]
    num_functions = len(function_nodes)

    # Length of each function in source lines (inclusive of both ends).
    function_lengths = [
        fn.end_point[0] - fn.start_point[0] + 1
        for fn in function_nodes
    ]
    avg_function_length = _safe_divide(sum(function_lengths), len(function_lengths))
    max_function_length = max(function_lengths) if function_lengths else 0

    # Parameters per function: children of the first parameter-list child,
    # ignoring punctuation nodes and nodes whose type is "self".
    args_counts = []
    for fn in function_nodes:
        for child in fn.children:
            if child.type in ("parameters", "formal_parameters",
                              "parameter_list", "argument_list"):
                params = [c for c in child.children
                          if c.type not in ("(", ")", ",", "self")]
                args_counts.append(len(params))
                break
    avg_args_per_function = _safe_divide(sum(args_counts), len(args_counts))

    # Classes and imports.
    num_classes = _count_node_types(all_nodes, nt.get("class", []))
    num_imports = _count_node_types(all_nodes, nt.get("import", []))

    # Control flow.
    num_if = _count_node_types(all_nodes, nt.get("if", []))
    num_for = _count_node_types(all_nodes, nt.get("for", []))
    num_while = _count_node_types(all_nodes, nt.get("while", []))
    num_try = _count_node_types(all_nodes, nt.get("try", []))
    num_lambdas = _count_node_types(all_nodes, nt.get("lambda", []))

    # Nesting depth: every if/for/while/function node records how many such
    # nodes enclose it, itself included. Walked with an explicit stack so
    # deep trees do not hit the recursion limit; only max and mean are
    # used, so visiting order is irrelevant.
    nesting_types = set(
        nt.get("if", []) + nt.get("for", []) +
        nt.get("while", []) + nt.get("function", [])
    )
    nesting_depths = []
    pending = [(root, 0)]
    while pending:
        node, depth = pending.pop()
        for child in node.children:
            if child.type in nesting_types:
                nesting_depths.append(depth + 1)
                pending.append((child, depth + 1))
            else:
                pending.append((child, depth))
    max_nesting_depth = max(nesting_depths) if nesting_depths else 0
    avg_nesting_depth = _safe_divide(sum(nesting_depths), len(nesting_depths))

    # Approximate cyclomatic complexity: 1 + number of branching constructs,
    # divided by the number of functions when there are any.
    total_branches = 1 + num_if + num_for + num_while + num_try
    cyclomatic_approx = (
        _safe_divide(total_branches, num_functions)
        if num_functions > 0
        else float(total_branches)
    )

    # Size normalisation: absolute counts grow with file size, so they are
    # divided by the number of non-blank lines (3 ifs in 10 lines = 0.30,
    # 3 ifs in 100 lines = 0.03).
    non_empty = max(sum(1 for l in _get_lines(code) if l.strip()), 1)
    if_density = _safe_divide(num_if, non_empty)
    for_density = _safe_divide(num_for, non_empty)
    while_density = _safe_divide(num_while, non_empty)
    try_density = _safe_divide(num_try, non_empty)
    function_density = _safe_divide(num_functions, non_empty)
    class_density = _safe_divide(num_classes, non_empty)
    import_density = _safe_divide(num_imports, non_empty)
    lambda_density = _safe_divide(num_lambdas, non_empty)
    ast_nodes_per_line = _safe_divide(ast_node_count, non_empty)

    # Entropy of the character distribution.
    char_entropy = _shannon_entropy(Counter(code), len(code) if code else 1)

    # Entropy of the token distribution (identifiers/keywords, integer
    # runs, and single punctuation characters).
    tokens = re.findall(r"[a-zA-Z_]\w*|[0-9]+|[^\w\s]", code)
    token_entropy = _shannon_entropy(
        Counter(tokens), len(tokens) if tokens else 1
    )

    return {
        # Tree metrics
        "ast_depth": ast_depth,
        "unique_node_type_ratio": unique_node_type_ratio,
        "ast_nodes_per_line": ast_nodes_per_line,
        # Functions
        "avg_function_length": avg_function_length,
        "max_function_length": max_function_length,
        "avg_args_per_function": avg_args_per_function,
        # Densities, normalised by non-blank lines
        "function_density": function_density,
        "class_density": class_density,
        "import_density": import_density,
        "if_density": if_density,
        "for_density": for_density,
        "while_density": while_density,
        "try_density": try_density,
        "lambda_density": lambda_density,
        # Nesting and complexity
        "max_nesting_depth": max_nesting_depth,
        "avg_nesting_depth": avg_nesting_depth,
        "cyclomatic_complexity_approx": cyclomatic_approx,
        # Entropy
        "char_entropy": char_entropy,
        "token_entropy": token_entropy,
    }
# ─────────────────────────────────────────────────────────────────────────────
# NAČIN 3: STATISTIČKA DETEKCIJA (Perplexity)
# ─────────────────────────────────────────────────────────────────────────────
def extract_statistical_features(code: str, model=None, tokenizer=None) -> dict:
    """
    Compute the perplexity of ``code`` under a language model.

    Perplexity is ``exp(loss)`` where ``loss`` is what the model reports when
    the input ids are also passed as labels. The input is truncated to 512
    tokens.

    Parameters:
        code (str): Source code as a string.
        model: Optional model; called as ``model(input_ids, labels=input_ids)``
            and expected to return an object with a ``loss`` attribute.
        tokenizer: Optional tokenizer; called with ``return_tensors="pt"``.

    Returns:
        dict: ``{"perplexity": float, "model_available": 0 or 1}``.
        ``perplexity`` is ``-1.0`` when no model/tokenizer was supplied
        (``model_available`` 0) or when the computation raised
        (``model_available`` 1, a warning is printed).
    """
    if model is None or tokenizer is None:
        return {"perplexity": -1.0, "model_available": 0}

    perplexity = -1.0
    try:
        import torch

        inputs = tokenizer(
            code, return_tensors="pt", truncation=True, max_length=512
        )
        input_ids = inputs["input_ids"]
        # The tokenizer produces CPU tensors. If the model exposes the device
        # it lives on, move the ids there; otherwise a model placed on an
        # accelerator raises a device mismatch and every call yields -1.
        device = getattr(model, "device", None)
        if device is not None:
            input_ids = input_ids.to(device)
        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
        perplexity = math.exp(outputs.loss.item())
    except Exception as e:
        # Best effort: the remaining features are still useful without it.
        print(f" [UPOZORENJE] Perplexity nije izračunat: {e}")
        perplexity = -1.0
    return {"perplexity": perplexity, "model_available": 1}
# ─────────────────────────────────────────────────────────────────────────────
# KOMBINIRANA FUNKCIJA — ulazna točka
# ─────────────────────────────────────────────────────────────────────────────
def extract_all_features(
    code: str,
    language=None,
    filename=None,
    model=None,
    tokenizer=None,
) -> dict:
    """
    Extract every feature group from ``code`` in a single call.

    The language is resolved in this order:
      1. the ``language`` argument, lower-cased, when given
      2. the extension of ``filename``, when given and recognised
      3. heuristics applied to the code itself

    Parameters:
        code (str): Source code as a string.
        language (str|None): Language name (e.g. "python", "java").
        filename (str|None): File name (e.g. "main.py").
        model: Optional model used for perplexity.
        tokenizer: Optional tokenizer used for perplexity.

    Returns:
        dict: ``"detected_language"`` followed by the style, structural and
        statistical features.
    """
    if language is not None:
        detected_lang = language.lower()
    else:
        detected_lang = None
        if filename is not None:
            detected_lang = detect_language_from_extension(filename)
        if not detected_lang:
            # No file name, or its extension was not recognised.
            detected_lang = detect_language_from_code(code)

    lang_config = get_config(detected_lang)

    features = {"detected_language": detected_lang}
    features.update(extract_style_features(code, lang_config))
    features.update(extract_structural_features(code, lang_config))
    features.update(extract_statistical_features(code, model, tokenizer))
    return features
# ─────────────────────────────────────────────────────────────────────────────
# BRZI TEST — pokreni: python feature_extraction.py
# ─────────────────────────────────────────────────────────────────────────────
# Quick smoke test: run `python feature_extraction.py` to print a handful of
# features for a few hand-written samples in different languages.
if __name__ == "__main__":
    # label -> (language, source snippet)
    test_cases = {
        "Python (AI)": ("python", '''
def calculate_average(numbers: list) -> float:
    """Calculate the arithmetic mean of a list of numbers."""
    if not numbers:
        raise ValueError("Cannot calculate average of an empty list.")
    total_sum = sum(numbers)
    count = len(numbers)
    return total_sum / count
'''),
        "Python (Human)": ("python", '''
def avg(nums):
    # quick avg
    return sum(nums) / len(nums)
'''),
        "JavaScript (AI)": ("javascript", '''
/**
 * Calculates the average of an array of numbers.
 */
function calculateAverage(numbers) {
    if (!numbers || numbers.length === 0) {
        throw new Error("Cannot calculate average of an empty array.");
    }
    const totalSum = numbers.reduce((acc, val) => acc + val, 0);
    return totalSum / numbers.length;
}
'''),
        "Java (AI)": ("java", '''
/**
 * Calculates the average of an integer array.
 */
public class Calculator {
    public static double calculateAverage(int[] numbers) {
        if (numbers == null || numbers.length == 0) {
            throw new IllegalArgumentException("Array must not be empty.");
        }
        int totalSum = 0;
        for (int currentNumber : numbers) {
            totalSum += currentNumber;
        }
        return (double) totalSum / numbers.length;
    }
}
'''),
    }

    # (display label, feature key). Every key must be one that
    # extract_all_features actually emits; the structural extractor reports
    # function *density*, not a raw function count.
    KEY_FEATURES = [
        ("Prepoznat jezik", "detected_language"),
        ("Omjer komentara", "comment_ratio"),
        ("Broj docstringova", "num_docstrings"),
        ("Prosj. duljina identif.", "avg_identifier_length"),
        ("Prosj. duljina fun. naziva", "avg_function_name_length"),
        ("Jednoslovna imena", "single_char_name_ratio"),
        ("Konzistentnost imenovanja", "naming_consistency"),
        ("Gustoća funkcija", "function_density"),
        ("Prosj. duljina funkcije", "avg_function_length"),
        ("Max ugniježđenost", "max_nesting_depth"),
        ("Aproks. složenost (CC)", "cyclomatic_complexity_approx"),
        ("Perplexity", "perplexity"),
    ]

    for label, (lang, code) in test_cases.items():
        print(f"\n{'─' * 55}")
        print(f" {label}")
        print(f"{'─' * 55}")
        features = extract_all_features(code, language=lang)
        for display_name, key in KEY_FEATURES:
            val = features.get(key, "N/A")
            if isinstance(val, float):
                print(f" {display_name:<32} {val:.4f}")
            else:
                print(f" {display_name:<32} {val}")
        print(f"\n Ukupno značajki: {len(features)}")
# ─────────────────────────────────────────────────────────────────────────────
# ANALIZA PO LINIJAMA — za prikaz sumnjivih linija u UI-u
# ─────────────────────────────────────────────────────────────────────────────
def analyze_lines(code: str, language: str = None, filename: str = None) -> list:
    """
    Scan the code line by line and return annotations for suspicious lines.

    Each annotation is a dict:
        {
            "line": int,   # 1-based line number
            "tone": str,   # "red" = strong signal, "amber" = moderate
            "note": str,   # short explanation shown in the UI
        }

    Checks, applied in this order (the first hit wins for a line):
      - Python docstrings and ``/** ... */`` documentation blocks (red)
      - inline comments of four or more words (amber)
      - two or more long identifiers on one line (red or amber by length)
      - Python type annotations (amber)
      - raise/throw of an ``*Error``/``*Exception`` with arguments (amber)
      - except/catch clauses (amber)

    At most 40% of the non-blank lines (but never fewer than 3) are
    reported, preferring red over amber.

    Parameters:
        code (str): Source code as a string.
        language (str|None): Language name; detected when ``None``.
        filename (str|None): File name, used to help language detection.

    Returns:
        list: Annotation dicts sorted by line number.
    """
    if language is None:
        if filename is not None:
            language = (detect_language_from_extension(filename)
                        or detect_language_from_code(code))
        else:
            language = detect_language_from_code(code)
    try:
        lang_config = get_config(language)
    except ValueError:
        # Unknown language: fall back to the Python comment patterns.
        lang_config = get_config("python")

    # Loop invariants, built once.
    inline_re = re.compile(lang_config.get("inline_comment", r"^\s*#"))
    string_literal_res = (
        (re.compile(r'"[^"]*"'), '""'),
        (re.compile(r"'[^']*'"), "''"),
        (re.compile(r"`[^`]*`"), "``"),
    )
    identifier_re = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]{4,})\b')
    return_annotation_re = re.compile(r'\)\s*->\s*\w')
    param_annotation_re = re.compile(
        r':\s*(int|float|str|bool|list|dict|tuple|set|Optional|Union|List|Dict)\b'
    )
    # "new" is optional so that `throw new Error(...)` is recognised as well
    # as `raise ValueError(...)`.
    raise_re = re.compile(
        r'^\s*(?:raise|throw)\s+(?:new\s+)?\w*(?:Error|Exception)\s*\('
    )
    # An optional leading "}" covers the `} catch (e) {` layout; the word
    # boundary keeps identifiers such as `exceptions` from matching.
    handler_re = re.compile(r'^\s*(?:\}\s*)?(?:except|catch)\b\s*[\(\w]')
    # Keywords and common short library names that do not count as
    # descriptive identifiers.
    reserved = frozenset({
        "return", "import", "function", "class", "interface", "public",
        "private", "static", "const", "false", "true", "none", "self",
        "print", "printf", "scanf", "range", "raise", "while", "break",
        "continue", "yield", "lambda", "assert", "except", "finally",
        "include", "define", "string", "vector", "struct", "unsigned",
        "length", "value", "write", "reads", "fopen", "fclose", "malloc",
        "sizeof", "stdio", "stdlib", "nullptr", "virtual", "override",
        "inline", "extern", "register", "volatile", "switch", "default",
    })

    lines = code.splitlines()
    annotations = []
    in_docstring = False
    docstring_char = None

    for i, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue

        # Python docstrings (""" or ''').
        if language == "python":
            if not in_docstring and ('"""' in stripped or "'''" in stripped):
                docstring_char = '"""' if '"""' in stripped else "'''"
                if stripped.count(docstring_char) >= 2:
                    # Opened and closed on the same line. This includes the
                    # empty `""""""`, which must not leave the scanner stuck
                    # in docstring mode.
                    annotations.append({
                        "line": i, "tone": "red",
                        "note": "Formal docstring — strong AI indicator"
                    })
                else:
                    in_docstring = True
                    annotations.append({
                        "line": i, "tone": "red",
                        "note": "Docstring block — strong AI indicator"
                    })
                continue
            elif in_docstring:
                annotations.append({
                    "line": i, "tone": "red",
                    "note": "Docstring content"
                })
                if docstring_char and docstring_char in stripped:
                    in_docstring = False
                continue

        # Javadoc / JSDoc blocks (/** ... */).
        if language in ("java", "javascript", "typescript", "cpp", "c"):
            if stripped.startswith(("/**", "* ")) or stripped == "*/":
                annotations.append({
                    "line": i, "tone": "red",
                    "note": "Formal documentation comment — strong AI indicator"
                })
                continue

        # Inline comments: only those with at least four words are flagged.
        if inline_re.match(line):
            comment_text = inline_re.sub("", line).strip()
            word_count = len(comment_text.split())
            if word_count >= 4:
                annotations.append({
                    "line": i, "tone": "amber",
                    "note": f"Inline comment ({word_count} words) — elevated comment density"
                })
            continue

        # Long identifiers. String literal contents are blanked first so
        # that natural-language words inside strings are not mistaken for
        # identifiers.
        code_only = stripped
        for literal_re, blank in string_literal_res:
            code_only = literal_re.sub(blank, code_only)
        identifiers = identifier_re.findall(code_only)
        real_ids = [x for x in identifiers if x.lower() not in reserved]
        if len(real_ids) >= 2:
            avg_len = sum(len(x) for x in real_ids) / len(real_ids)
            if avg_len >= 9:
                annotations.append({
                    "line": i, "tone": "red",
                    "note": f"Very long identifiers (avg {avg_len:.0f} chars) — AI naming pattern"
                })
                continue
            elif avg_len >= 7:
                annotations.append({
                    "line": i, "tone": "amber",
                    "note": f"Descriptive identifier names (avg {avg_len:.0f} chars)"
                })
                continue

        # Type annotations (Python only).
        if language == "python":
            if (return_annotation_re.search(stripped)
                    or param_annotation_re.search(stripped)):
                annotations.append({
                    "line": i, "tone": "amber",
                    "note": "Type annotation — uncommon in student code"
                })
                continue

        # Explicit raise/throw of an error or exception.
        if raise_re.match(line):
            annotations.append({
                "line": i, "tone": "amber",
                "note": "Explicit exception with message — AI error handling pattern"
            })
            continue

        # Exception handlers.
        if handler_re.match(line):
            annotations.append({
                "line": i, "tone": "amber",
                "note": "Exception handling — AI code often handles all edge cases"
            })
            continue

    # Flagging most of the file would make the markers meaningless, so keep
    # only the strongest annotations: at most 40% of the non-blank lines,
    # but never fewer than 3.
    total_nonblank = sum(1 for l in lines if l.strip())
    max_annotations = max(3, int(total_nonblank * 0.40))

    # Red before amber, then by line number; cut; restore line order.
    priority = {"red": 0, "amber": 1}
    annotations.sort(key=lambda a: (priority.get(a["tone"], 2), a["line"]))
    annotations = annotations[:max_annotations]
    annotations.sort(key=lambda a: a["line"])
    return annotations