Spaces:

karimic
/

ai-code-detector

Sleeping

ai-code-detector / feature_extraction.py

Karim Krklec

Obavljene optimizacije

a641ed5 24 days ago

35.6 kB

	"""
	feature_extraction.py
	=====================
	Višejezično izvlačenje značajki iz programskog koda.

	Podržani jezici: Python, JavaScript, TypeScript, Java, C, C++, Go, Rust, Ruby

	Organizirano po načinima detekcije:
	1. Stilska detekcija – komentari, imenovanje, formatiranje
	2. Strukturna detekcija – AST analiza, složenost, tok kontrole
	3. Statistička detekcija – perplexity pomoću jezičnog modela

	Glavna funkcija:
	extract_all_features(code, language=None, filename=None) -> dict
	"""

	import re
	import math
	from collections import Counter
	from tree_sitter import Language, Parser

	from language_config import (
	get_config,
	detect_language_from_code,
	detect_language_from_extension,
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# POMOĆNE FUNKCIJE
	# ─────────────────────────────────────────────────────────────────────────────

	def _safe_divide(a: float, b: float, default: float = 0.0) -> float:
	    """Divide ``a`` by ``b``; return ``default`` instead when ``b`` equals zero."""
	    if b == 0:
	        return default
	    return a / b


	def _get_lines(code: str) -> list:
	    """Split ``code`` at line boundaries and return the lines without terminators."""
	    lines = code.splitlines(keepends=False)
	    return lines


	def _build_tree(code: str, ts_module):
	    """
	    Parse ``code`` with Tree-sitter and return the resulting tree.

	    ``ts_module`` is the language-specific grammar module (for example
	    ``tree_sitter_python``); its ``language()`` result is wrapped in a
	    ``Language`` and handed to a fresh ``Parser``.

	    Any exception raised while encoding, building the parser or parsing is
	    swallowed on purpose and reported as ``None`` so callers can fall back.
	    """
	    try:
	        # Undecodable characters are replaced rather than aborting the parse.
	        source_bytes = code.encode("utf-8", errors="replace")
	        grammar = Language(ts_module.language())
	        return Parser(grammar).parse(source_bytes)
	    except Exception:
	        return None


	def _walk_tree(node) -> list:
	    """
	    Return every node of the tree rooted at ``node`` in pre-order.

	    The traversal is iterative (explicit stack) so that very deep trees do
	    not hit Python's recursion limit, and each node is appended exactly once
	    instead of being re-copied through nested ``extend`` calls.

	    Parameters:
	        node: Any object exposing a ``children`` sequence of like objects.

	    Returns:
	        list: ``node`` followed by all of its descendants; a parent always
	        precedes its children and siblings keep their original order.
	    """
	    ordered = []
	    pending = [node]
	    while pending:
	        current = pending.pop()
	        ordered.append(current)
	        # Push children reversed so the first child is popped (visited) first.
	        pending.extend(reversed(current.children))
	    return ordered


	def _count_node_types(all_nodes: list, type_names: list) -> int:
	    """Count the nodes in ``all_nodes`` whose ``type`` is one of ``type_names``."""
	    matches = 0
	    if type_names:
	        for candidate in all_nodes:
	            if candidate.type in type_names:
	                matches += 1
	    return matches


	def _get_node_depth(node, current: int = 0) -> int:
	    """
	    Return the maximum depth of the tree rooted at ``node``.

	    The walk is iterative so deeply nested trees cannot exhaust Python's
	    recursion limit.

	    Parameters:
	        node: Any object exposing a ``children`` sequence of like objects.
	        current (int): Depth assigned to ``node`` itself (default 0).

	    Returns:
	        int: ``current`` for a leaf, otherwise ``current`` plus the number of
	        edges on the longest path from ``node`` down to a leaf.
	    """
	    deepest = current
	    pending = [(node, current)]
	    while pending:
	        item, depth = pending.pop()
	        if depth > deepest:
	            deepest = depth
	        pending.extend((child, depth + 1) for child in item.children)
	    return deepest


	# ─────────────────────────────────────────────────────────────────────────────
	# NAČIN 1: STILSKA DETEKCIJA
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_style_features(code: str, lang_config: dict) -> dict:
	    """
	    Compute style-related features of a source snippet.

	    Three groups of measurements are produced:
	    - comments: counted with the regex patterns supplied in ``lang_config``;
	    - naming: collected from identifier nodes of the Tree-sitter parse tree;
	    - formatting: measured directly on the raw text lines.

	    Parameters:
	        code (str): Source code to analyse.
	        lang_config (dict): Language configuration. This function reads the
	            keys "inline_comment", "block_comment", "docstring_pattern"
	            (optional), "ts_module" and "node_types".

	    Returns:
	        dict: Feature name -> int/float value.
	    """
	    lines = _get_lines(code)
	    # An empty snippet is treated as one line so ratios stay well defined.
	    total_lines = len(lines) or 1

	    # ── COMMENTS ───────────────────────────────────────────────────────────
	    # The single-line comment pattern is language specific and comes from
	    # the configuration.
	    comment_pattern = lang_config["inline_comment"]
	    comment_lines = [line for line in lines if re.match(comment_pattern, line)]
	    num_comment_lines = len(comment_lines)

	    # Words and characters per comment line; the comment marker itself is
	    # removed before counting words.
	    comment_words_total = 0
	    total_comment_chars = 0
	    for line in comment_lines:
	        comment_words_total += len(re.sub(comment_pattern, "", line).strip().split())
	        total_comment_chars += len(line)
	    avg_comment_length_words = _safe_divide(comment_words_total, num_comment_lines)

	    # Block comments: non-greedy match between the configured delimiters.
	    block_delimiters = lang_config["block_comment"]
	    num_block_comments = 0
	    if block_delimiters:
	        opener, closer = block_delimiters
	        block_pattern = re.escape(opener) + r"[\s\S]*?" + re.escape(closer)
	        num_block_comments = len(re.findall(block_pattern, code))

	    # Share of characters that sit on single-line comment lines.
	    comment_to_code_ratio = _safe_divide(total_comment_chars, max(len(code), 1))

	    # Documentation comments, when the language config defines a pattern.
	    docstring_pattern = lang_config.get("docstring_pattern")
	    num_docstrings = len(re.findall(docstring_pattern, code)) if docstring_pattern else 0

	    # ── NAMING (from the Tree-sitter tree) ─────────────────────────────────
	    identifier_names = []
	    function_names = []

	    tree = _build_tree(code, lang_config["ts_module"])
	    if tree:
	        node_types = lang_config["node_types"]
	        id_types = node_types.get("identifier", ["identifier"])
	        fn_types = node_types.get("function", [])

	        for node in _walk_tree(tree.root_node):
	            # Every non-empty identifier node contributes its text.
	            if node.type in id_types and node.text:
	                identifier_names.append(node.text.decode("utf-8", errors="replace"))

	            # A function's name is taken from its first direct identifier child.
	            if node.type in fn_types:
	                name_node = next(
	                    (c for c in node.children if c.type in id_types and c.text),
	                    None,
	                )
	                if name_node is not None:
	                    function_names.append(
	                        name_node.text.decode("utf-8", errors="replace")
	                    )

	    identifier_total = len(identifier_names)
	    avg_identifier_length = _safe_divide(
	        sum(map(len, identifier_names)), identifier_total
	    )
	    avg_function_name_length = _safe_divide(
	        sum(map(len, function_names)), len(function_names)
	    )

	    # Single-character names (i, x, n, k...); the author's heuristic treats
	    # a higher share as typical of human-written code.
	    single_char_count = sum(1 for name in identifier_names if len(name) == 1)
	    single_char_ratio = _safe_divide(single_char_count, identifier_total)

	    # Lexical diversity: distinct identifiers divided by all identifiers.
	    lexical_diversity = _safe_divide(len(set(identifier_names)), identifier_total)

	    # Naming-convention predicates.
	    def _is_snake(name):
	        return "_" in name and name == name.lower() and not name.startswith("_")

	    def _is_camel(name):
	        if len(name) < 2 or "_" in name:
	            return False
	        return name[0].islower() and any(ch.isupper() for ch in name)

	    def _is_pascal(name):
	        if len(name) < 2 or "_" in name:
	            return False
	        return name[0].isupper() and any(ch.islower() for ch in name)

	    total_ids = max(identifier_total, 1)
	    snake_count = sum(map(_is_snake, identifier_names))
	    camel_count = sum(map(_is_camel, identifier_names))
	    pascal_count = sum(map(_is_pascal, identifier_names))

	    snake_ratio = _safe_divide(snake_count, total_ids)
	    camel_ratio = _safe_divide(camel_count, total_ids)

	    # Share of identifiers that follow the single most common convention.
	    naming_consistency = _safe_divide(
	        max(snake_count, camel_count, pascal_count), total_ids
	    )

	    # ── FORMATTING (language independent) ──────────────────────────────────
	    non_empty_lines = [line for line in lines if line.strip()]
	    empty_line_ratio = _safe_divide(total_lines - len(non_empty_lines), total_lines)

	    line_lengths = [len(line) for line in non_empty_lines] or [0]
	    avg_line_length = _safe_divide(sum(line_lengths), len(line_lengths))
	    max_line_length = max(line_lengths)

	    # Tabs vs. spaces: flagged when more than 10% of lines start with a tab.
	    tab_lines = sum(1 for line in lines if line.startswith("\t"))
	    uses_tabs = int(tab_lines > len(lines) * 0.1)

	    # Lines that end in whitespace.
	    trailing_ws = sum(1 for line in lines if line != line.rstrip())
	    trailing_ws_ratio = _safe_divide(trailing_ws, total_lines)

	    # Consistency of spacing around assignment/comparison operators.
	    with_space = len(re.findall(r"\s[=!<>]=?\s", code))
	    without_space = len(re.findall(r"[^\s=!<>][=!<>]=[^\s=]", code))
	    operator_consistency = _safe_divide(
	        max(with_space, without_space), with_space + without_space
	    )

	    return {
	        # Comments
	        "num_comment_lines": num_comment_lines,
	        "comment_ratio": _safe_divide(num_comment_lines, total_lines),
	        "avg_comment_length_words": avg_comment_length_words,
	        "comment_to_code_ratio": comment_to_code_ratio,
	        "num_block_comments": num_block_comments,
	        "num_docstrings": num_docstrings,
	        # Naming
	        "avg_identifier_length": avg_identifier_length,
	        "avg_function_name_length": avg_function_name_length,
	        "single_char_name_ratio": single_char_ratio,
	        "lexical_diversity": lexical_diversity,
	        "snake_case_ratio": snake_ratio,
	        "camel_case_ratio": camel_ratio,
	        "naming_consistency": naming_consistency,
	        # Formatting
	        "total_lines": total_lines,
	        "empty_line_ratio": empty_line_ratio,
	        "avg_line_length": avg_line_length,
	        "max_line_length": max_line_length,
	        "uses_tabs": uses_tabs,
	        "trailing_whitespace_ratio": trailing_ws_ratio,
	        "operator_spacing_consistency": operator_consistency,
	    }


	# ─────────────────────────────────────────────────────────────────────────────
	# NAČIN 2: STRUKTURNA DETEKCIJA
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_structural_features(code: str, lang_config: dict) -> dict:
	    """
	    Compute structural features of a source snippet from its Tree-sitter AST.

	    Node-type names differ between grammars (for example
	    "function_definition" vs. "method_declaration"); the mapping is read
	    from ``lang_config["node_types"]``.

	    Parameters:
	        code (str): Source code to analyse.
	        lang_config (dict): Language configuration; the keys "ts_module" and
	            "node_types" are read here.

	    Returns:
	        dict: Structural features. When parsing fails, the same keys are
	        returned with every value set to 0, so the feature schema does not
	        depend on whether the parse succeeded.
	    """
	    # Single source of truth for the feature schema (used for the fallback).
	    feature_names = (
	        "ast_depth", "unique_node_type_ratio", "ast_nodes_per_line",
	        "avg_function_length", "max_function_length", "avg_args_per_function",
	        "function_density", "class_density", "import_density",
	        "if_density", "for_density", "while_density", "try_density",
	        "lambda_density",
	        "max_nesting_depth", "avg_nesting_depth",
	        "cyclomatic_complexity_approx",
	        "char_entropy", "token_entropy",
	    )

	    tree = _build_tree(code, lang_config["ts_module"])
	    if tree is None:
	        return dict.fromkeys(feature_names, 0)

	    root = tree.root_node
	    all_nodes = _walk_tree(root)
	    nt = lang_config["node_types"]

	    # ── AST SHAPE ──────────────────────────────────────────────────────────
	    ast_depth = _get_node_depth(root)
	    ast_node_count = len(all_nodes)
	    node_type_counts = Counter(n.type for n in all_nodes)
	    unique_node_type_ratio = _safe_divide(len(node_type_counts), ast_node_count)

	    # ── FUNCTIONS ──────────────────────────────────────────────────────────
	    fn_types = nt.get("function", [])
	    function_nodes = [n for n in all_nodes if n.type in fn_types]
	    num_functions = len(function_nodes)

	    # Length of each function in source lines (inclusive of both ends).
	    function_lengths = [
	        fn.end_point[0] - fn.start_point[0] + 1 for fn in function_nodes
	    ]
	    avg_function_length = _safe_divide(sum(function_lengths), len(function_lengths))
	    max_function_length = max(function_lengths) if function_lengths else 0

	    # Parameters per function: children of the first parameter-list node,
	    # ignoring punctuation tokens.
	    param_list_types = (
	        "parameters", "formal_parameters", "parameter_list", "argument_list",
	    )
	    ignored_param_types = ("(", ")", ",", "self")
	    args_counts = []
	    for fn in function_nodes:
	        for child in fn.children:
	            if child.type in param_list_types:
	                args_counts.append(
	                    sum(1 for c in child.children
	                        if c.type not in ignored_param_types)
	                )
	                break
	    avg_args_per_function = _safe_divide(sum(args_counts), len(args_counts))

	    # ── CLASSES AND IMPORTS ────────────────────────────────────────────────
	    num_classes = _count_node_types(all_nodes, nt.get("class", []))
	    num_imports = _count_node_types(all_nodes, nt.get("import", []))

	    # ── CONTROL FLOW ───────────────────────────────────────────────────────
	    num_if = _count_node_types(all_nodes, nt.get("if", []))
	    num_for = _count_node_types(all_nodes, nt.get("for", []))
	    num_while = _count_node_types(all_nodes, nt.get("while", []))
	    num_try = _count_node_types(all_nodes, nt.get("try", []))
	    num_lambdas = _count_node_types(all_nodes, nt.get("lambda", []))

	    # ── NESTING DEPTH ──────────────────────────────────────────────────────
	    nesting_types = set(
	        nt.get("if", []) + nt.get("for", []) +
	        nt.get("while", []) + nt.get("function", [])
	    )

	    # Iterative walk (explicit stack) so deep trees cannot overflow the
	    # recursion limit. Each nesting construct records the depth at which it
	    # sits; its descendants inherit that depth.
	    nesting_depths = []
	    pending = [(root, 0)]
	    while pending:
	        node, depth = pending.pop()
	        for child in node.children:
	            child_depth = depth
	            if child.type in nesting_types:
	                child_depth += 1
	                nesting_depths.append(child_depth)
	            pending.append((child, child_depth))

	    max_nesting_depth = max(nesting_depths) if nesting_depths else 0
	    avg_nesting_depth = _safe_divide(sum(nesting_depths), len(nesting_depths))

	    # ── CYCLOMATIC COMPLEXITY (APPROXIMATION) ──────────────────────────────
	    # CC ≈ 1 + number of branching constructs, averaged over the functions
	    # when there are any.
	    total_branches = 1 + num_if + num_for + num_while + num_try
	    cyclomatic_approx = (
	        _safe_divide(total_branches, num_functions)
	        if num_functions > 0
	        else float(total_branches)
	    )

	    # ── NORMALISATION BY CODE SIZE ─────────────────────────────────────────
	    # Absolute counts grow with file size; dividing by the number of
	    # non-empty lines gives densities comparable across snippet lengths
	    # (3 ifs in 10 lines = 0.30 vs. 3 ifs in 100 lines = 0.03).
	    non_empty = max(sum(1 for line in _get_lines(code) if line.strip()), 1)

	    def density(count):
	        return _safe_divide(count, non_empty)

	    # ── ENTROPY ────────────────────────────────────────────────────────────
	    # Shannon entropy H = -sum(p * log2(p)) over the distinct elements.
	    def shannon_entropy(counts):
	        total = sum(counts.values()) or 1
	        return -sum(
	            (c / total) * math.log2(c / total) for c in counts.values()
	        )

	    # Character level: distribution of individual characters.
	    char_entropy = shannon_entropy(Counter(code))

	    # Token level: identifiers/keywords, integer literals and single
	    # punctuation characters. The alternation uses unescaped "|" so that
	    # each kind of token is matched on its own.
	    tokens = re.findall(r"[a-zA-Z_]\w*|[0-9]+|[^\w\s]", code)
	    token_entropy = shannon_entropy(Counter(tokens))

	    return {
	        # AST metrics
	        "ast_depth": ast_depth,
	        "unique_node_type_ratio": unique_node_type_ratio,
	        "ast_nodes_per_line": density(ast_node_count),
	        # Functions
	        "avg_function_length": avg_function_length,
	        "max_function_length": max_function_length,
	        "avg_args_per_function": avg_args_per_function,
	        # Densities, normalised by non-empty lines
	        "function_density": density(num_functions),
	        "class_density": density(num_classes),
	        "import_density": density(num_imports),
	        "if_density": density(num_if),
	        "for_density": density(num_for),
	        "while_density": density(num_while),
	        "try_density": density(num_try),
	        "lambda_density": density(num_lambdas),
	        # Nesting and complexity
	        "max_nesting_depth": max_nesting_depth,
	        "avg_nesting_depth": avg_nesting_depth,
	        "cyclomatic_complexity_approx": cyclomatic_approx,
	        # Entropy
	        "char_entropy": char_entropy,
	        "token_entropy": token_entropy,
	    }


	# ─────────────────────────────────────────────────────────────────────────────
	# NAČIN 3: STATISTIČKA DETEKCIJA (Perplexity)
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_statistical_features(code: str, model=None, tokenizer=None) -> dict:
	    """
	    Compute the perplexity of ``code`` under a language model.

	    The author's heuristic: low perplexity means the model "expected" the
	    code (more likely AI-written), high perplexity means it was "surprised"
	    (more likely human-written).

	    Parameters:
	        code (str): Source code in any language.
	        model: Optional model, called as ``model(input_ids, labels=input_ids)``;
	            its result must expose ``.loss``.
	        tokenizer: Optional tokenizer, called with ``return_tensors="pt"``,
	            truncation enabled and ``max_length=512``.

	    Returns:
	        dict: ``{"perplexity": float, "model_available": 0 or 1}``.
	        Perplexity is -1.0 when no model/tokenizer was supplied or when the
	        computation raised; in the latter case a warning is printed and
	        ``model_available`` is still 1.
	    """
	    result = {"perplexity": -1.0, "model_available": 0}
	    if model is None or tokenizer is None:
	        return result

	    result["model_available"] = 1
	    try:
	        # Imported lazily so the rest of the pipeline works without torch.
	        import torch

	        encoded = tokenizer(
	            code, return_tensors="pt", truncation=True, max_length=512
	        )
	        input_ids = encoded["input_ids"]
	        with torch.no_grad():
	            loss = model(input_ids, labels=input_ids).loss

	        # Perplexity is the exponential of the loss reported by the model.
	        result["perplexity"] = math.exp(loss.item())
	    except Exception as e:
	        print(f" [UPOZORENJE] Perplexity nije izračunat: {e}")

	    return result


	# ─────────────────────────────────────────────────────────────────────────────
	# KOMBINIRANA FUNKCIJA — ulazna točka
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_all_features(
	    code: str,
	    language=None,
	    filename=None,
	    model=None,
	    tokenizer=None,
	) -> dict:
	    """
	    Extract every feature group from ``code`` in a single call.

	    The language is resolved in this order:
	    1. the ``language`` argument, lower-cased, when given;
	    2. the extension of ``filename``, when given and recognised;
	    3. heuristics applied to the code itself.

	    Parameters:
	        code (str): Source code to analyse.
	        language (str|None): Language name (e.g. "python", "java").
	        filename (str|None): File name (e.g. "main.py").
	        model: Optional language model used for perplexity.
	        tokenizer: Optional tokenizer matching ``model``.

	    Returns:
	        dict: "detected_language" followed by the style, structural and
	        statistical features.
	    """
	    if language is not None:
	        detected_lang = language.lower()
	    else:
	        detected_lang = None
	        if filename is not None:
	            detected_lang = detect_language_from_extension(filename)
	        # Fall back to content heuristics when the extension gave nothing.
	        if not detected_lang:
	            detected_lang = detect_language_from_code(code)

	    lang_config = get_config(detected_lang)

	    # Later groups overwrite same-named keys from earlier ones.
	    features = {"detected_language": detected_lang}
	    features.update(extract_style_features(code, lang_config))
	    features.update(extract_structural_features(code, lang_config))
	    features.update(extract_statistical_features(code, model, tokenizer))
	    return features


	# ─────────────────────────────────────────────────────────────────────────────
	# BRZI TEST — pokreni: python feature_extraction.py
	# ─────────────────────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Quick manual smoke test: python feature_extraction.py
	    import textwrap

	    # label -> (language, sample code). The samples are dedented so that
	    # they stay valid source code regardless of how this block is indented.
	    test_cases = {
	        "Python (AI)": ("python", textwrap.dedent('''
	            def calculate_average(numbers: list) -> float:
	                """Calculate the arithmetic mean of a list of numbers."""
	                if not numbers:
	                    raise ValueError("Cannot calculate average of an empty list.")
	                total_sum = sum(numbers)
	                count = len(numbers)
	                return total_sum / count
	            ''')),
	        "Python (Human)": ("python", textwrap.dedent('''
	            def avg(nums):
	                # quick avg
	                return sum(nums) / len(nums)
	            ''')),
	        "JavaScript (AI)": ("javascript", textwrap.dedent('''
	            /**
	             * Calculates the average of an array of numbers.
	             */
	            function calculateAverage(numbers) {
	                if (!numbers || numbers.length === 0) {
	                    throw new Error("Cannot calculate average of an empty array.");
	                }
	                const totalSum = numbers.reduce((acc, val) => acc + val, 0);
	                return totalSum / numbers.length;
	            }
	            ''')),
	        "Java (AI)": ("java", textwrap.dedent('''
	            /**
	             * Calculates the average of an integer array.
	             */
	            public class Calculator {
	                public static double calculateAverage(int[] numbers) {
	                    if (numbers == null || numbers.length == 0) {
	                        throw new IllegalArgumentException("Array must not be empty.");
	                    }
	                    int totalSum = 0;
	                    for (int currentNumber : numbers) {
	                        totalSum += currentNumber;
	                    }
	                    return (double) totalSum / numbers.length;
	                }
	            }
	            ''')),
	    }

	    # (display label, feature key). Every key must be one that
	    # extract_all_features actually returns; the structural extractor
	    # reports function_density rather than an absolute function count.
	    KEY_FEATURES = [
	        ("Prepoznat jezik", "detected_language"),
	        ("Omjer komentara", "comment_ratio"),
	        ("Broj docstringova", "num_docstrings"),
	        ("Prosj. duljina identif.", "avg_identifier_length"),
	        ("Prosj. duljina fun. naziva", "avg_function_name_length"),
	        ("Jednoslovna imena", "single_char_name_ratio"),
	        ("Konzistentnost imenovanja", "naming_consistency"),
	        ("Gustoća funkcija", "function_density"),
	        ("Prosj. duljina funkcije", "avg_function_length"),
	        ("Max ugniježđenost", "max_nesting_depth"),
	        ("Aproks. složenost (CC)", "cyclomatic_complexity_approx"),
	        ("Perplexity", "perplexity"),
	    ]

	    for label, (lang, code) in test_cases.items():
	        print(f"\n{'═' * 55}")
	        print(f" {label}")
	        print(f"{'═' * 55}")
	        features = extract_all_features(code, language=lang)
	        for display_name, key in KEY_FEATURES:
	            val = features.get(key, "N/A")
	            if isinstance(val, float):
	                print(f" {display_name:<32} {val:.4f}")
	            else:
	                print(f" {display_name:<32} {val}")

	        # Total number of features produced for this sample.
	        print(f"\n Ukupno značajki: {len(features)}")


	# ─────────────────────────────────────────────────────────────────────────────
	# ANALIZA PO LINIJAMA — za prikaz sumnjivih linija u UI-u
	# ─────────────────────────────────────────────────────────────────────────────

	def analyze_lines(code: str, language: str = None, filename: str = None) -> list:
	    """
	    Analyse ``code`` line by line and return annotations for suspicious lines.

	    Each annotation is a dict:
	        {
	            "line": int,   # 1-based line number
	            "tone": str,   # "red" = strong signal, "amber" = moderate
	            "note": str,   # short explanation shown in the UI
	        }

	    Signals checked per line, in this order (the first hit wins):
	    - Python docstrings and C-style block comments (red)
	    - single-line comments of four or more words (amber)
	    - two or more long identifiers on one line (red or amber by length)
	    - Python type annotations (amber)
	    - raise/throw of an ...Error/...Exception with arguments (amber)
	    - except/catch clauses (amber)

	    At most max(3, 40% of the non-blank lines) annotations are returned;
	    red ones are kept in preference to amber ones.

	    Parameters:
	        code (str): Source code to analyse.
	        language (str|None): Language name; matched case-insensitively.
	            When None it is detected from ``filename`` and/or the code.
	        filename (str|None): File name used to help language detection.

	    Returns:
	        list: Annotation dicts sorted by line number.
	    """
	    if language is None:
	        if filename is not None:
	            language = (detect_language_from_extension(filename)
	                        or detect_language_from_code(code))
	        else:
	            language = detect_language_from_code(code)
	    else:
	        # Same normalisation as extract_all_features, so "Python" == "python".
	        language = language.lower()

	    try:
	        lang_config = get_config(language)
	    except ValueError:
	        # Unknown language: fall back to the Python comment configuration.
	        lang_config = get_config("python")

	    inline_pat = lang_config.get("inline_comment", r"^\s*#")
	    is_python = language == "python"
	    has_c_style_blocks = language in ("java", "javascript", "typescript", "cpp", "c")

	    # Loop invariants, built once instead of once per line.
	    string_literal_subs = (
	        (re.compile(r'"[^"]*"'), '""'),
	        (re.compile(r"'[^']*'"), "''"),
	        (re.compile(r"`[^`]*`"), "``"),
	    )
	    identifier_re = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]{4,})\b')
	    return_annotation_re = re.compile(r'\)\s*->\s*\w')
	    param_annotation_re = re.compile(
	        r':\s*(int|float|str|bool|list|dict|tuple|set|Optional|Union|List|Dict)\b'
	    )
	    # "raise ValueError(", "throw new IllegalArgumentException(", ...
	    raise_re = re.compile(
	        r'^\s*(raise|throw)\s+(?:new\s+)?\w*(?:Error|Exception)\s*\('
	    )
	    # "except ValueError:", "catch (e) {", "} catch (IOException e) {".
	    # The word boundary keeps names such as "exceptional" from matching.
	    except_re = re.compile(r'^\s*(?:\}\s*)?(except|catch)\b\s*[\(\w]')
	    # Keywords and very common library names that are not counted as
	    # descriptive identifiers.
	    reserved = frozenset({
	        "return", "import", "function", "class", "interface", "public",
	        "private", "static", "const", "false", "true", "none", "self",
	        "print", "printf", "scanf", "range", "raise", "while", "break",
	        "continue", "yield", "lambda", "assert", "except", "finally",
	        "include", "define", "string", "vector", "struct", "unsigned",
	        "length", "value", "write", "reads", "fopen", "fclose", "malloc",
	        "sizeof", "stdio", "stdlib", "nullptr", "virtual", "override",
	        "inline", "extern", "register", "volatile", "switch", "default",
	    })

	    lines = code.splitlines()
	    annotations = []

	    def flag(line_no, tone, note):
	        annotations.append({"line": line_no, "tone": tone, "note": note})

	    in_docstring = False
	    docstring_char = None
	    in_block_comment = False

	    for i, line in enumerate(lines, start=1):
	        stripped = line.strip()
	        if not stripped:
	            continue

	        # ── PYTHON DOCSTRINGS ──────────────────────────────────────────────
	        if is_python:
	            if not in_docstring and ('"""' in stripped or "'''" in stripped):
	                docstring_char = '"""' if '"""' in stripped else "'''"
	                # Opened and closed on the same line -> one-line docstring.
	                if stripped.count(docstring_char) >= 2 and len(stripped) > 6:
	                    flag(i, "red", "Formal docstring — strong AI indicator")
	                else:
	                    in_docstring = True
	                    flag(i, "red", "Docstring block — strong AI indicator")
	                continue
	            elif in_docstring:
	                flag(i, "red", "Docstring content")
	                if docstring_char and docstring_char in stripped:
	                    in_docstring = False
	                continue

	        # ── C-STYLE BLOCK COMMENTS (/* ... */, Javadoc, JSDoc) ─────────────
	        # Tracked with explicit state so that every line between the opener
	        # and the closer is flagged, while code that merely starts with "*"
	        # (e.g. a pointer dereference) is not.
	        if has_c_style_blocks:
	            if in_block_comment:
	                flag(i, "red",
	                     "Formal documentation comment — strong AI indicator")
	                if "*/" in stripped:
	                    in_block_comment = False
	                continue
	            if stripped.startswith("/*"):
	                flag(i, "red",
	                     "Formal documentation comment — strong AI indicator")
	                # Stay inside the comment unless it closes on this line.
	                if "*/" not in stripped[2:]:
	                    in_block_comment = True
	                continue

	        # ── SINGLE-LINE COMMENTS ───────────────────────────────────────────
	        if re.match(inline_pat, line):
	            # Short comments (fewer than four words) are not flagged.
	            comment_text = re.sub(inline_pat, "", line).strip()
	            word_count = len(comment_text.split())
	            if word_count >= 4:
	                flag(i, "amber",
	                     f"Inline comment ({word_count} words) — elevated comment density")
	            continue

	        # ── LONG IDENTIFIERS ───────────────────────────────────────────────
	        # String-literal contents are blanked first; otherwise ordinary
	        # words inside strings would be counted as identifiers.
	        code_only = stripped
	        for literal_re, replacement in string_literal_subs:
	            code_only = literal_re.sub(replacement, code_only)

	        real_ids = [
	            name for name in identifier_re.findall(code_only)
	            if name.lower() not in reserved
	        ]

	        if len(real_ids) >= 2:
	            avg_len = sum(len(name) for name in real_ids) / len(real_ids)
	            if avg_len >= 9:
	                flag(i, "red",
	                     f"Very long identifiers (avg {avg_len:.0f} chars) — AI naming pattern")
	                continue
	            elif avg_len >= 7:
	                flag(i, "amber",
	                     f"Descriptive identifier names (avg {avg_len:.0f} chars)")
	                continue

	        # ── TYPE ANNOTATIONS (Python) ──────────────────────────────────────
	        if is_python:
	            if (return_annotation_re.search(stripped)
	                    or param_annotation_re.search(stripped)):
	                flag(i, "amber", "Type annotation — uncommon in student code")
	                continue

	        # ── RAISE / THROW WITH AN EXPLICIT EXCEPTION ───────────────────────
	        if raise_re.match(line):
	            flag(i, "amber",
	                 "Explicit exception with message — AI error handling pattern")
	            continue

	        # ── EXCEPT / CATCH CLAUSES ─────────────────────────────────────────
	        if except_re.match(line):
	            flag(i, "amber",
	                 "Exception handling — AI code often handles all edge cases")
	            continue

	    # Cap the number of annotations: flagging most of the file is not
	    # useful, so keep at most 40% of the non-blank lines (minimum 3).
	    total_nonblank = sum(1 for text in lines if text.strip())
	    max_annotations = max(3, int(total_nonblank * 0.40))

	    # Keep red before amber (ties by line), then restore line order.
	    priority = {"red": 0, "amber": 1}
	    annotations.sort(key=lambda a: (priority.get(a["tone"], 2), a["line"]))
	    annotations = annotations[:max_annotations]
	    annotations.sort(key=lambda a: a["line"])

	    return annotations