ai-code-detector / feature_extraction.py
Karim Krklec
Obavljene optimizacije
a641ed5
Raw
History Blame Contribute Delete
35.6 kB
"""
feature_extraction.py
=====================
Multi-language extraction of features from source code.
Supported languages: Python, JavaScript, TypeScript, Java, C, C++, Go, Rust, Ruby
Organized by detection method:
1. Stylistic detection – comments, naming, formatting
2. Structural detection – AST analysis, complexity, control flow
3. Statistical detection – perplexity computed with a language model
Main function:
extract_all_features(code, language=None, filename=None) -> dict
"""
import re
import math
from collections import Counter
from tree_sitter import Language, Parser
from language_config import (
get_config,
detect_language_from_code,
detect_language_from_extension,
)
# ─────────────────────────────────────────────────────────────────────────────
# POMOĆNE FUNKCIJE
# ─────────────────────────────────────────────────────────────────────────────
def _safe_divide(a: float, b: float, default: float = 0.0) -> float:
"""Dijeljenje koje ne puca na nuli."""
return a / b if b != 0 else default
def _get_lines(code: str) -> list:
"""Vraća sve linije koda kao listu stringova."""
return code.splitlines()
def _build_tree(code: str, ts_module):
"""
Parsira kod pomoću Tree-sitter i vraća stablo.
Koristi modul specifičan za jezik (npr. tree_sitter_python).
Vraća None ako parsiranje ne uspije.
"""
try:
lang = Language(ts_module.language())
parser = Parser(lang)
return parser.parse(code.encode("utf-8", errors="replace"))
except Exception:
return None
def _walk_tree(node) -> list:
"""
Prolazi kroz cijelo Tree-sitter stablo i vraΔ‡a listu svih čvorova.
Ekvivalent ast.walk() iz Pythonovog standardnog modula.
"""
result = [node]
for child in node.children:
result.extend(_walk_tree(child))
return result
def _count_node_types(all_nodes: list, type_names: list) -> int:
"""Broji koliko se puta pojavljuje bilo koji od zadanih tipova čvorova."""
if not type_names:
return 0
return sum(1 for n in all_nodes if n.type in type_names)
def _get_node_depth(node, current: int = 0) -> int:
"""Rekurzivno računa maksimalnu dubinu stabla."""
if not node.children:
return current
return max(_get_node_depth(child, current + 1) for child in node.children)
# ─────────────────────────────────────────────────────────────────────────────
# NAČIN 1: STILSKA DETEKCIJA
# ─────────────────────────────────────────────────────────────────────────────
def extract_style_features(code: str, lang_config: dict) -> dict:
    """
    Extract stylistic features (comments, naming, formatting) from source code.

    The same routine serves every language because:
      - line comments are recognised with the regex stored in ``lang_config``;
      - identifiers are read from the Tree-sitter syntax tree;
      - formatting metrics are computed directly on the raw lines.

    Parameters:
        code (str): Source code as a string.
        lang_config (dict): Language configuration. This function reads the
            keys ``inline_comment``, ``block_comment``, ``docstring_pattern``
            (optional), ``ts_module`` and ``node_types``.

    Returns:
        dict: Stylistic features with float/int values.
    """
    source_lines = _get_lines(code)
    total_lines = len(source_lines) or 1

    # -- COMMENTS -----------------------------------------------------------
    # The line-comment regex comes from the language configuration.
    comment_regex = lang_config["inline_comment"]
    commented = [row for row in source_lines if re.match(comment_regex, row)]
    num_comment_lines = len(commented)

    words_in_comments = 0
    chars_in_comments = 0
    for row in commented:
        # Word count of the comment text with the comment marker removed.
        words_in_comments += len(re.sub(comment_regex, "", row).strip().split())
        chars_in_comments += len(row)
    avg_comment_length_words = _safe_divide(words_in_comments, num_comment_lines)

    # Block comments are counted non-greedily between the configured
    # start/end delimiters.
    num_block_comments = 0
    block_delims = lang_config["block_comment"]
    if block_delims:
        opener, closer = block_delims
        block_regex = re.escape(opener) + r"[\s\S]*?" + re.escape(closer)
        num_block_comments = len(re.findall(block_regex, code))

    # Share of characters that sit on line-comment lines.
    comment_to_code_ratio = _safe_divide(chars_in_comments, max(len(code), 1))

    # Documentation comments (docstrings, JSDoc, Javadoc, ...).
    docstring_regex = lang_config.get("docstring_pattern")
    num_docstrings = len(re.findall(docstring_regex, code)) if docstring_regex else 0

    # -- NAMING (from the Tree-sitter tree) ---------------------------------
    identifier_names = []
    function_names = []
    tree = _build_tree(code, lang_config["ts_module"])
    if tree:
        nodes = _walk_tree(tree.root_node)
        id_types = lang_config["node_types"].get("identifier", ["identifier"])
        fn_types = lang_config["node_types"].get("function", [])
        for node in nodes:
            # Every identifier node contributes its text.
            if node.type in id_types and node.text:
                ident = node.text.decode("utf-8", errors="replace")
                if ident:
                    identifier_names.append(ident)
            # A function's name is its first direct identifier child.
            if node.type in fn_types:
                for child in node.children:
                    if child.type in id_types and child.text:
                        function_names.append(
                            child.text.decode("utf-8", errors="replace")
                        )
                        break

    id_count = len(identifier_names)
    avg_identifier_length = _safe_divide(sum(map(len, identifier_names)), id_count)
    avg_function_name_length = _safe_divide(
        sum(map(len, function_names)), len(function_names)
    )

    # Share of one-letter names (i, x, n, k, ...).
    single_char_ratio = _safe_divide(
        sum(1 for ident in identifier_names if len(ident) == 1), id_count
    )

    # Lexical diversity: distinct identifiers / all identifier occurrences.
    lexical_diversity = _safe_divide(len(set(identifier_names)), id_count)

    # Naming conventions, classified in a single pass:
    #   snake_case - contains "_", all lowercase, no leading "_"
    #   camelCase  - no "_", longer than 1, lowercase first char, has uppercase
    #   PascalCase - no "_", longer than 1, uppercase first char, has lowercase
    snake_count = camel_count = pascal_count = 0
    for ident in identifier_names:
        if "_" in ident:
            if ident == ident.lower() and not ident.startswith("_"):
                snake_count += 1
        elif len(ident) > 1:
            if ident[0].islower():
                if any(ch.isupper() for ch in ident):
                    camel_count += 1
            elif ident[0].isupper():
                if any(ch.islower() for ch in ident):
                    pascal_count += 1

    total_ids = max(id_count, 1)
    snake_ratio = _safe_divide(snake_count, total_ids)
    camel_ratio = _safe_divide(camel_count, total_ids)
    # Share of identifiers that follow the single most common convention.
    naming_consistency = _safe_divide(
        max(snake_count, camel_count, pascal_count), total_ids
    )

    # -- FORMATTING (language independent) ----------------------------------
    filled_lines = [row for row in source_lines if row.strip()]
    empty_line_ratio = _safe_divide(total_lines - len(filled_lines), total_lines)

    line_lengths = [len(row) for row in filled_lines] or [0]
    avg_line_length = _safe_divide(sum(line_lengths), len(line_lengths))
    max_line_length = max(line_lengths)

    # Tabs count as the indentation style when more than 10% of the lines
    # start with a tab.
    tab_lines = sum(1 for row in source_lines if row.startswith("\t"))
    uses_tabs = int(tab_lines > len(source_lines) * 0.1)

    # Lines that end in whitespace.
    trailing_ws = sum(1 for row in source_lines if row != row.rstrip())
    trailing_ws_ratio = _safe_divide(trailing_ws, total_lines)

    # Consistency of spacing around assignment/comparison operators.
    with_space = len(re.findall(r"\s[=!<>]=?\s", code))
    without_space = len(re.findall(r"[^\s=!<>][=!<>]=[^\s=]", code))
    operator_consistency = _safe_divide(
        max(with_space, without_space), with_space + without_space
    )

    features = {}
    # Comments
    features["num_comment_lines"] = num_comment_lines
    features["comment_ratio"] = _safe_divide(num_comment_lines, total_lines)
    features["avg_comment_length_words"] = avg_comment_length_words
    features["comment_to_code_ratio"] = comment_to_code_ratio
    features["num_block_comments"] = num_block_comments
    features["num_docstrings"] = num_docstrings
    # Naming
    features["avg_identifier_length"] = avg_identifier_length
    features["avg_function_name_length"] = avg_function_name_length
    features["single_char_name_ratio"] = single_char_ratio
    features["lexical_diversity"] = lexical_diversity
    features["snake_case_ratio"] = snake_ratio
    features["camel_case_ratio"] = camel_ratio
    features["naming_consistency"] = naming_consistency
    # Formatting
    features["total_lines"] = total_lines
    features["empty_line_ratio"] = empty_line_ratio
    features["avg_line_length"] = avg_line_length
    features["max_line_length"] = max_line_length
    features["uses_tabs"] = uses_tabs
    features["trailing_whitespace_ratio"] = trailing_ws_ratio
    features["operator_spacing_consistency"] = operator_consistency
    return features
# ─────────────────────────────────────────────────────────────────────────────
# NAČIN 2: STRUKTURNA DETEKCIJA
# ─────────────────────────────────────────────────────────────────────────────
def extract_structural_features(code: str, lang_config: dict) -> dict:
    """
    Extract structural features from source code via Tree-sitter AST analysis.

    Language differences are limited to node-type names (for example
    "function_definition" vs "method_declaration"); those are looked up in
    ``lang_config["node_types"]``.

    Parameters:
        code (str): Source code as a string.
        lang_config (dict): Language configuration; this function reads the
            keys ``ts_module`` and ``node_types``.

    Returns:
        dict: Structural features. When the code cannot be parsed, a dict
        with exactly the same keys and all values set to 0 is returned, so
        callers always see one feature schema.
    """
    # Single source of truth for the feature schema; the parse-failure
    # fallback below must expose exactly the keys of the success path.
    feature_names = (
        "ast_depth", "unique_node_type_ratio", "ast_nodes_per_line",
        "avg_function_length", "max_function_length", "avg_args_per_function",
        "function_density", "class_density", "import_density",
        "if_density", "for_density", "while_density",
        "try_density", "lambda_density",
        "max_nesting_depth", "avg_nesting_depth",
        "cyclomatic_complexity_approx",
        "char_entropy", "token_entropy",
    )

    tree = _build_tree(code, lang_config["ts_module"])
    if tree is None:
        return dict.fromkeys(feature_names, 0)

    root = tree.root_node
    all_nodes = _walk_tree(root)
    nt = lang_config["node_types"]

    # -- AST TREE -----------------------------------------------------------
    ast_depth = _get_node_depth(root)
    ast_node_count = len(all_nodes)
    node_type_counts = Counter(n.type for n in all_nodes)
    unique_node_type_ratio = _safe_divide(len(node_type_counts), ast_node_count)

    # -- FUNCTIONS ----------------------------------------------------------
    function_nodes = [n for n in all_nodes if n.type in nt.get("function", [])]
    num_functions = len(function_nodes)

    # Length of every function in source lines (inclusive of both ends).
    function_lengths = [
        fn.end_point[0] - fn.start_point[0] + 1
        for fn in function_nodes
    ]
    avg_function_length = _safe_divide(sum(function_lengths), len(function_lengths))
    max_function_length = max(function_lengths) if function_lengths else 0

    # Parameters per function: children of the first parameter-list-like
    # child, ignoring punctuation nodes.
    args_counts = []
    for fn in function_nodes:
        for child in fn.children:
            if child.type in ("parameters", "formal_parameters",
                              "parameter_list", "argument_list"):
                params = [c for c in child.children
                          if c.type not in ("(", ")", ",", "self")]
                args_counts.append(len(params))
                break
    avg_args_per_function = _safe_divide(sum(args_counts), len(args_counts))

    # -- CLASSES AND IMPORTS ------------------------------------------------
    num_classes = _count_node_types(all_nodes, nt.get("class", []))
    num_imports = _count_node_types(all_nodes, nt.get("import", []))

    # -- CONTROL FLOW -------------------------------------------------------
    num_if = _count_node_types(all_nodes, nt.get("if", []))
    num_for = _count_node_types(all_nodes, nt.get("for", []))
    num_while = _count_node_types(all_nodes, nt.get("while", []))
    num_try = _count_node_types(all_nodes, nt.get("try", []))
    num_lambdas = _count_node_types(all_nodes, nt.get("lambda", []))

    # -- NESTING DEPTH ------------------------------------------------------
    nesting_types = set(
        nt.get("if", []) + nt.get("for", []) +
        nt.get("while", []) + nt.get("function", [])
    )
    # Explicit stack instead of recursion so deep trees cannot exhaust the
    # interpreter's recursion limit. Only max and sum of the collected depths
    # are used, so the visiting order is irrelevant.
    nesting_depths = []
    pending = [(root, 0)]
    while pending:
        node, depth = pending.pop()
        for child in node.children:
            if child.type in nesting_types:
                nesting_depths.append(depth + 1)
                pending.append((child, depth + 1))
            else:
                pending.append((child, depth))
    max_nesting_depth = max(nesting_depths) if nesting_depths else 0
    avg_nesting_depth = _safe_divide(sum(nesting_depths), len(nesting_depths))

    # -- APPROXIMATE CYCLOMATIC COMPLEXITY ----------------------------------
    # CC ~ 1 + number of branching constructs, divided by the number of
    # functions (or left undivided when the code has no functions).
    total_branches = 1 + num_if + num_for + num_while + num_try
    cyclomatic_approx = (
        _safe_divide(total_branches, num_functions)
        if num_functions > 0
        else float(total_branches)
    )

    # -- NORMALISATION BY CODE SIZE -----------------------------------------
    # Absolute counts grow with file size; dividing by the number of
    # non-empty lines yields densities comparable across short and long
    # inputs (3 ifs in 10 lines = 0.30 vs 3 ifs in 100 lines = 0.03).
    lines_all = _get_lines(code)
    non_empty = max(sum(1 for l in lines_all if l.strip()), 1)
    if_density = _safe_divide(num_if, non_empty)
    for_density = _safe_divide(num_for, non_empty)
    while_density = _safe_divide(num_while, non_empty)
    try_density = _safe_divide(num_try, non_empty)
    function_density = _safe_divide(num_functions, non_empty)
    class_density = _safe_divide(num_classes, non_empty)
    import_density = _safe_divide(num_imports, non_empty)
    lambda_density = _safe_divide(num_lambdas, non_empty)
    ast_nodes_per_line = _safe_divide(ast_node_count, non_empty)

    # -- ENTROPY ------------------------------------------------------------
    # Shannon entropy H = -sum(p * log2(p)) over the distinct elements.
    def shannon_entropy(counts, total):
        return -sum((c / total) * math.log2(c / total) for c in counts)

    # Character entropy: distribution of individual characters.
    char_counts = Counter(code)
    total_chars = len(code) if code else 1
    char_entropy = shannon_entropy(char_counts.values(), total_chars)

    # Token entropy: distribution of identifiers/keywords, integer literals
    # and single punctuation characters.
    tokens = re.findall(r"[a-zA-Z_]\w*|[0-9]+|[^\w\s]", code)
    token_counts = Counter(tokens)
    total_tokens = len(tokens) if tokens else 1
    token_entropy = shannon_entropy(token_counts.values(), total_tokens)

    return {
        # AST metrics
        "ast_depth": ast_depth,
        "unique_node_type_ratio": unique_node_type_ratio,
        "ast_nodes_per_line": ast_nodes_per_line,
        # Functions
        "avg_function_length": avg_function_length,
        "max_function_length": max_function_length,
        "avg_args_per_function": avg_args_per_function,
        # Densities, normalised by non-empty lines of code
        "function_density": function_density,
        "class_density": class_density,
        "import_density": import_density,
        "if_density": if_density,
        "for_density": for_density,
        "while_density": while_density,
        "try_density": try_density,
        "lambda_density": lambda_density,
        # Nesting and complexity
        "max_nesting_depth": max_nesting_depth,
        "avg_nesting_depth": avg_nesting_depth,
        "cyclomatic_complexity_approx": cyclomatic_approx,
        # Entropy
        "char_entropy": char_entropy,
        "token_entropy": token_entropy,
    }
# ─────────────────────────────────────────────────────────────────────────────
# NAČIN 3: STATISTIČKA DETEKCIJA (Perplexity)
# ─────────────────────────────────────────────────────────────────────────────
def extract_statistical_features(code: str, model=None, tokenizer=None) -> dict:
    """
    Compute the perplexity of ``code`` under a language model.

    Low perplexity means the model "expected" the code (suggesting AI
    authorship); high perplexity means the model was "surprised" (suggesting
    a human author). The method is language-agnostic: the same model
    receives code in any programming language.

    Parameters:
        code (str): Source code as a string.
        model: Optional causal language model, called as
            ``model(input_ids, labels=input_ids)`` and expected to return an
            object with a ``loss`` attribute.
        tokenizer: Optional tokenizer, called with ``return_tensors="pt"``,
            ``truncation=True`` and ``max_length=512``.

    Returns:
        dict: ``{"perplexity": float, "model_available": 0 | 1}``.
        ``perplexity`` is -1.0 when no model/tokenizer was supplied
        (``model_available`` = 0) or when the computation failed
        (``model_available`` = 1).
    """
    if model is None or tokenizer is None:
        return {"perplexity": -1.0, "model_available": 0}
    try:
        import torch
        inputs = tokenizer(
            code, return_tensors="pt", truncation=True, max_length=512
        )
        input_ids = inputs["input_ids"]
        # Tokenizer output lives on the CPU. If the model reports its device,
        # move the ids there; otherwise a GPU-resident model raises a device
        # mismatch and perplexity would silently always be -1.
        device = getattr(model, "device", None)
        if device is not None:
            input_ids = input_ids.to(device)
        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
        # The loss is exponentiated to obtain perplexity.
        perplexity = math.exp(outputs.loss.item())
    except Exception as e:
        # Best effort: the rest of the pipeline continues without this feature.
        print(f" [UPOZORENJE] Perplexity nije izračunat: {e}")
        perplexity = -1.0
    return {"perplexity": perplexity, "model_available": 1}
# ─────────────────────────────────────────────────────────────────────────────
# COMBINED FUNCTION — entry point
# ─────────────────────────────────────────────────────────────────────────────
def extract_all_features(
    code: str,
    language=None,
    filename=None,
    model=None,
    tokenizer=None,
) -> dict:
    """
    Extract ALL features from the code in a single call.

    The language is resolved in this order:
        1. the ``language`` argument (lower-cased), if given;
        2. the file extension taken from ``filename``, if given;
        3. heuristics applied to the code itself.

    Parameters:
        code (str): Source code as a string.
        language (str|None): Language name (e.g. "python", "java").
        filename (str|None): File name (e.g. "main.py").
        model: (optional) language model used for perplexity.
        tokenizer: (optional) tokenizer matching ``model``.

    Returns:
        dict: All features plus the key "detected_language".
    """
    # Resolve the language.
    if language is not None:
        detected_lang = language.lower()
    else:
        detected_lang = None
        if filename is not None:
            detected_lang = detect_language_from_extension(filename)
        # Fall back to content heuristics when the extension gave nothing.
        detected_lang = detected_lang or detect_language_from_code(code)

    config = get_config(detected_lang)

    # Run the three extractors and merge their outputs in a fixed order.
    features = {"detected_language": detected_lang}
    features.update(extract_style_features(code, config))
    features.update(extract_structural_features(code, config))
    features.update(extract_statistical_features(code, model, tokenizer))
    return features
# ─────────────────────────────────────────────────────────────────────────────
# QUICK TEST — run: python feature_extraction.py
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Sample snippets: label -> (language, source code).
    # The Python samples carry real indentation; without it they are not
    # valid Python and the demo would measure broken code.
    test_cases = {
        "Python (AI)": ("python", '''
def calculate_average(numbers: list) -> float:
    """Calculate the arithmetic mean of a list of numbers."""
    if not numbers:
        raise ValueError("Cannot calculate average of an empty list.")
    total_sum = sum(numbers)
    count = len(numbers)
    return total_sum / count
'''),
        "Python (Human)": ("python", '''
def avg(nums):
    # quick avg
    return sum(nums) / len(nums)
'''),
        "JavaScript (AI)": ("javascript", '''
/**
* Calculates the average of an array of numbers.
*/
function calculateAverage(numbers) {
if (!numbers || numbers.length === 0) {
throw new Error("Cannot calculate average of an empty array.");
}
const totalSum = numbers.reduce((acc, val) => acc + val, 0);
return totalSum / numbers.length;
}
'''),
        "Java (AI)": ("java", '''
/**
* Calculates the average of an integer array.
*/
public class Calculator {
public static double calculateAverage(int[] numbers) {
if (numbers == null || numbers.length == 0) {
throw new IllegalArgumentException("Array must not be empty.");
}
int totalSum = 0;
for (int currentNumber : numbers) {
totalSum += currentNumber;
}
return (double) totalSum / numbers.length;
}
}
'''),
    }

    # (display label, feature key) pairs printed for every sample.
    # "function_density" replaces the former "num_functions" key, which the
    # structural extractor no longer emits for parsed code and therefore
    # always printed "N/A".
    KEY_FEATURES = [
        ("Prepoznat jezik", "detected_language"),
        ("Omjer komentara", "comment_ratio"),
        ("Broj docstringova", "num_docstrings"),
        ("Prosj. duljina identif.", "avg_identifier_length"),
        ("Prosj. duljina fun. naziva", "avg_function_name_length"),
        ("Jednoslovna imena", "single_char_name_ratio"),
        ("Konzistentnost imenovanja", "naming_consistency"),
        ("Gustoća funkcija", "function_density"),
        ("Prosj. duljina funkcije", "avg_function_length"),
        ("Max ugniježđenost", "max_nesting_depth"),
        ("Aproks. složenost (CC)", "cyclomatic_complexity_approx"),
        ("Perplexity", "perplexity"),
    ]

    for label, (lang, code) in test_cases.items():
        print(f"\n{'═' * 55}")
        print(f" {label}")
        print(f"{'═' * 55}")
        features = extract_all_features(code, language=lang)
        for display_name, key in KEY_FEATURES:
            val = features.get(key, "N/A")
            # Floats are shown with four decimals; everything else verbatim.
            if isinstance(val, float):
                print(f" {display_name:<32} {val:.4f}")
            else:
                print(f" {display_name:<32} {val}")
        print(f"\n Ukupno značajki: {len(features)}")
# ─────────────────────────────────────────────────────────────────────────────
# PER-LINE ANALYSIS — for displaying suspicious lines in the UI
# ─────────────────────────────────────────────────────────────────────────────
def analyze_lines(code: str, language: str = None, filename: str = None) -> list:
    """
    Analyse the code line by line and return annotations for suspicious lines.

    Every element of the returned list is a dict:
        {
            "line": int,   # line number (1-based)
            "tone": str,   # "red" = strong signal, "amber" = moderate signal
            "note": str,   # short explanation (shown in the UI)
        }

    Signals, checked per non-blank line in this order (first match wins):
        - Python triple-quoted docstrings and /** ... */ doc comments (red)
        - line comments of four or more words (amber)
        - two or more long identifiers on one line (red when their average
          length is >= 9 characters, amber when >= 7)
        - Python type annotations (amber)
        - raise/throw of an ...Error/...Exception with arguments, including
          the ``throw new X(...)`` form (amber)
        - except/catch clauses, including ``} catch (...)`` (amber)

    At most ``max(3, 40% of the non-blank lines)`` annotations are kept, red
    ones taking priority over amber ones.

    Parameters:
        code (str): Source code as a string.
        language (str|None): Language name (case-insensitive); detected
            automatically when None.
        filename (str|None): File name, used to detect the language from its
            extension.

    Returns:
        list: Annotation dicts sorted by line number.
    """
    if language is None:
        if filename is not None:
            language = (detect_language_from_extension(filename)
                        or detect_language_from_code(code))
        else:
            language = detect_language_from_code(code)
    else:
        # Same normalisation as extract_all_features, so "Python" and
        # "python" take the same language-specific branches below.
        language = language.lower()

    try:
        lang_config = get_config(language)
    except ValueError:
        # Unknown language: fall back to the Python configuration.
        lang_config = get_config("python")

    # Patterns and lookup tables are built once, outside the per-line loop.
    inline_re = re.compile(lang_config.get("inline_comment", r"^\s*#"))
    string_literal_subs = (
        (re.compile(r'"[^"]*"'), '""'),
        (re.compile(r"'[^']*'"), "''"),
        (re.compile(r"`[^`]*`"), "``"),
    )
    identifier_re = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]{4,})\b')
    return_annotation_re = re.compile(r'\)\s*->\s*\w')
    param_annotation_re = re.compile(
        r':\s*(int|float|str|bool|list|dict|tuple|set|Optional|Union|List|Dict)\b'
    )
    # "new" is optional so that `throw new IllegalStateException(...)` matches
    # as well as Python's `raise ValueError(...)`.
    raise_re = re.compile(
        r'^\s*(?:raise|throw)\s+(?:new\s+)?\w*(?:Error|Exception)\s*\('
    )
    # A leading "}" is allowed so that `} catch (e) {` is recognised.
    catch_re = re.compile(r'^\s*(?:\}\s*)?(?:except|catch)\s*[\(\w]')
    # Keywords and common short stdlib names that do not count as
    # descriptive identifiers.
    reserved = frozenset({
        "return", "import", "function", "class", "interface", "public",
        "private", "static", "const", "false", "true", "none", "self",
        "print", "printf", "scanf", "range", "raise", "while", "break",
        "continue", "yield", "lambda", "assert", "except", "finally",
        "include", "define", "string", "vector", "struct", "unsigned",
        "length", "value", "write", "reads", "fopen", "fclose", "malloc",
        "sizeof", "stdio", "stdlib", "nullptr", "virtual", "override",
        "inline", "extern", "register", "volatile", "switch", "default",
    })

    lines = code.splitlines()
    annotations = []

    def flag(line_no, tone, note):
        annotations.append({"line": line_no, "tone": tone, "note": note})

    in_docstring = False
    docstring_char = None

    for i, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue

        # -- DOCSTRINGS / BLOCK COMMENTS ------------------------------------
        # Python triple-quoted strings (""" or ''').
        if language == "python":
            if not in_docstring and ('"""' in stripped or "'''" in stripped):
                docstring_char = '"""' if '"""' in stripped else "'''"
                if stripped.count(docstring_char) >= 2:
                    # Opened and closed on the same line. An empty `""""""`
                    # carries no text, so it is neither flagged nor treated
                    # as the start of a multi-line block.
                    if len(stripped) > 6:
                        flag(i, "red", "Formal docstring — strong AI indicator")
                else:
                    in_docstring = True
                    flag(i, "red", "Docstring block — strong AI indicator")
                continue
            elif in_docstring:
                flag(i, "red", "Docstring content")
                if docstring_char and docstring_char in stripped:
                    in_docstring = False
                continue

        # Javadoc / JSDoc blocks (/** ... */).
        if language in ("java", "javascript", "typescript", "cpp", "c"):
            if (stripped.startswith("/**") or stripped.startswith("* ")
                    or stripped == "*/"):
                flag(i, "red",
                     "Formal documentation comment — strong AI indicator")
                continue

        # -- LINE COMMENTS --------------------------------------------------
        if inline_re.match(line):
            # Short comments (fewer than four words) are not flagged.
            comment_text = inline_re.sub("", line).strip()
            word_count = len(comment_text.split())
            if word_count >= 4:
                flag(i, "amber",
                     f"Inline comment ({word_count} words) — elevated comment density")
            continue

        # -- LONG IDENTIFIERS -----------------------------------------------
        # String-literal contents are blanked first; otherwise natural-language
        # words inside strings would be mistaken for identifiers.
        code_only = stripped
        for literal_re, empty_literal in string_literal_subs:
            code_only = literal_re.sub(empty_literal, code_only)
        identifiers = identifier_re.findall(code_only)
        real_ids = [x for x in identifiers if x.lower() not in reserved]
        if len(real_ids) >= 2:
            avg_len = sum(len(x) for x in real_ids) / len(real_ids)
            if avg_len >= 9:
                flag(i, "red",
                     f"Very long identifiers (avg {avg_len:.0f} chars) — AI naming pattern")
                continue
            elif avg_len >= 7:
                flag(i, "amber",
                     f"Descriptive identifier names (avg {avg_len:.0f} chars)")
                continue

        # -- TYPE ANNOTATIONS (Python) --------------------------------------
        if language == "python":
            if (return_annotation_re.search(stripped)
                    or param_annotation_re.search(stripped)):
                flag(i, "amber", "Type annotation — uncommon in student code")
                continue

        # -- RAISE / THROW WITH A MESSAGE -----------------------------------
        if raise_re.match(line):
            flag(i, "amber",
                 "Explicit exception with message — AI error handling pattern")
            continue

        # -- EXCEPT / CATCH -------------------------------------------------
        if catch_re.match(line):
            flag(i, "amber",
                 "Exception handling — AI code often handles all edge cases")
            continue

    # Flagging most of the file would be meaningless, so only the most
    # suspicious lines are kept: at most 40% of the non-blank lines, but
    # never fewer than 3.
    total_nonblank = sum(1 for l in lines if l.strip())
    max_annotations = max(3, int(total_nonblank * 0.40))

    # Keep red before amber (ties broken by line number), then restore
    # line order for display.
    priority = {"red": 0, "amber": 1}
    annotations.sort(key=lambda a: (priority.get(a["tone"], 2), a["line"]))
    annotations = annotations[:max_annotations]
    annotations.sort(key=lambda a: a["line"])
    return annotations