Spaces:

DevilBits
/

auto-swe-agent-ui

Sleeping

App Files Files Community

auto-swe-agent-ui / indexing /parser.py

DevilBits

fix: enforce safe empty bounds for tracking data charts and match dataframe list alignments

6085b61 7 days ago

raw

history blame contribute delete

6.55 kB

	"""AST-based Python code parser.

	Extracts function/class/module chunks with docstrings, signatures, and
	line ranges from Python files. Uses stdlib ``ast`` — no external parser needed.
	"""

	from __future__ import annotations

	import ast
	import os
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import List, Optional, Set

	# Directory names to prune when walking repositories (virtualenvs, VCS
	# metadata, tool caches, build output).
	#
	# NOTE: "*.egg-info" is a glob pattern, not a literal directory name. A plain
	# ``name in IGNORE_DIRS`` membership test only matches it literally, so
	# consumers need fnmatch-style matching for it to take effect.
	IGNORE_DIRS: Set[str] = {
	    "*.egg-info",
	    ".eggs",
	    ".git",
	    ".mypy_cache",
	    ".next",
	    ".pytest_cache",
	    ".tox",
	    ".venv",
	    "__pycache__",
	    "build",
	    "dist",
	    "htmlcov",
	    "node_modules",
	    "venv",
	}


	@dataclass
	class CodeChunk:
	    """One indexed unit of source code extracted from a Python file."""

	    # Path of the file the chunk was extracted from.
	    file_path: str
	    # Kind of chunk: "function", "class", or "module".
	    chunk_type: str
	    # Identifier of the function/class, or the module stem.
	    name: str
	    # Short one-line rendering of the definition header.
	    signature: str
	    # Docstring text, or "" when there is none.
	    docstring: str
	    # 1-based first and last line of the chunk in the file.
	    start_line: int
	    end_line: int
	    # First 500 characters of the chunk's source.
	    body_preview: str
	    # Complete source of the chunk; optional for callers that only need
	    # the preview.
	    full_text: str = ""


	def _get_docstring(node: ast.AST) -> str:
	"""Extract docstring from an AST node body, if present."""
	if (
	node.body
	and isinstance(node.body[0], ast.Expr)
	and isinstance(node.body[0].value, ast.Constant)
	and isinstance(node.body[0].value.value, str)
	):
	return ast.get_docstring(node) or ""
	return ""


	def _chunk_text(file_text: str, node: ast.AST, start: int, end: int) -> tuple[str, str]:
	"""Return (body_preview, full_text) for a node's line range."""
	lines = file_text.splitlines(keepends=True)
	segment = "".join(lines[start - 1 : end])
	preview = segment[:500]
	return preview, segment


	def _format_params(args: ast.arguments) -> str:
	    """Render every parameter name of a function in declaration order."""
	    # posonlyargs only exists on Python 3.8+.
	    names = [a.arg for a in getattr(args, "posonlyargs", [])]
	    if names:
	        names.append("/")
	    names.extend(a.arg for a in args.args)
	    if args.vararg is not None:
	        names.append(f"*{args.vararg.arg}")
	    elif args.kwonlyargs:
	        # A bare "*" separates keyword-only parameters when there is no *args.
	        names.append("*")
	    names.extend(a.arg for a in args.kwonlyargs)
	    if args.kwarg is not None:
	        names.append(f"**{args.kwarg.arg}")
	    return ", ".join(names)


	def _base_label(base: ast.expr) -> str:
	    """Render a class base expression as source text ("" if not renderable)."""
	    if isinstance(base, ast.Name):
	        return base.id
	    if isinstance(base, ast.Attribute):
	        owner = _base_label(base.value)
	        return f"{owner}.{base.attr}" if owner else ""
	    # Subscripts, calls, etc.: ast.unparse is only available on Python 3.9+.
	    unparse = getattr(ast, "unparse", None)
	    return unparse(base) if unparse else ""


	def parse_file(filepath: str) -> List[CodeChunk]:
	    """Parse a single Python file into CodeChunks.

	    Extracts:
	    - A "module" chunk, only when the module has a docstring
	    - A "function" chunk per (async) function definition, at any nesting level
	    - A "class" chunk per class definition, at any nesting level

	    Args:
	        filepath: Path to the file. Anything that does not exist or does not
	            end in ``.py`` yields an empty list.

	    Returns:
	        The extracted chunks, or an empty list when the file cannot be read
	        or parsed.
	    """
	    path = Path(filepath)
	    if path.suffix != ".py" or not path.exists():
	        return []

	    try:
	        file_text = path.read_text(encoding="utf-8", errors="replace")
	    except OSError:
	        return []

	    try:
	        tree = ast.parse(file_text, filename=filepath)
	    except (SyntaxError, ValueError):
	        # ValueError: older Pythons raise it for source containing NUL bytes.
	        return []

	    chunks: List[CodeChunk] = []

	    # Module-level docstring
	    mod_doc = _get_docstring(tree)
	    if mod_doc:
	        chunks.append(
	            CodeChunk(
	                file_path=filepath,
	                chunk_type="module",
	                name=path.stem,
	                signature=f"module {path.stem}",
	                docstring=mod_doc,
	                start_line=1,
	                end_line=len(file_text.splitlines()),
	                body_preview=file_text[:500],
	                full_text=file_text,
	            )
	        )

	    for node in ast.walk(tree):
	        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
	            keyword = "async def" if isinstance(node, ast.AsyncFunctionDef) else "def"
	            sig = f"{keyword} {node.name}({_format_params(node.args)}):"
	            chunk_type = "function"
	        elif isinstance(node, ast.ClassDef):
	            # Drop bases that could not be rendered so the list never
	            # contains empty slots such as "class X(, Y):".
	            bases = ", ".join(filter(None, map(_base_label, node.bases)))
	            sig = f"class {node.name}({bases}):" if bases else f"class {node.name}:"
	            chunk_type = "class"
	        else:
	            continue

	        start = node.lineno
	        # end_lineno can be None/absent; fall back to a one-line range.
	        end = getattr(node, "end_lineno", None) or start
	        preview, full = _chunk_text(file_text, node, start, end)
	        chunks.append(
	            CodeChunk(
	                file_path=filepath,
	                chunk_type=chunk_type,
	                name=node.name,
	                signature=sig,
	                docstring=_get_docstring(node),
	                start_line=start,
	                end_line=end,
	                body_preview=preview,
	                full_text=full,
	            )
	        )

	    return chunks


	def parse_repository(repo_path: str) -> List[CodeChunk]:
	    """Walk ``repo_path`` and parse every Python file into CodeChunks.

	    Directories whose name matches an entry of ``IGNORE_DIRS`` are pruned.
	    Entries are matched as case-sensitive glob patterns, so both literal
	    names (".git") and patterns ("*.egg-info") take effect. Directories and
	    files are visited in sorted order so the result is deterministic.

	    Args:
	        repo_path: Root directory of the repository.

	    Returns:
	        All chunks from every ``.py`` file under the root. A file whose
	        parsing raises is skipped with a warning rather than aborting the
	        whole walk.
	    """
	    import logging
	    from fnmatch import fnmatchcase

	    log = logging.getLogger(__name__)

	    def _is_ignored(dirname: str) -> bool:
	        return any(fnmatchcase(dirname, pattern) for pattern in IGNORE_DIRS)

	    chunks: List[CodeChunk] = []
	    repo = Path(repo_path).resolve()

	    for root, dirs, files in os.walk(repo):
	        # Assign in place: os.walk only honours pruning done on this list.
	        dirs[:] = sorted(d for d in dirs if not _is_ignored(d))
	        for fname in sorted(files):
	            if not fname.endswith(".py"):
	                continue
	            fpath = os.path.join(root, fname)
	            try:
	                chunks.extend(parse_file(fpath))
	            except Exception as exc:
	                # Best-effort indexing: one bad file must not abort the
	                # run, but the failure should still be visible.
	                log.warning("Skipping %s: %s", fpath, exc)

	    return chunks


	def check_index_staleness(repo_path: str, index_path: str) -> bool:
	    """Check whether any Python file is newer than the index file.

	    Directories whose name matches an entry of ``IGNORE_DIRS`` are pruned.
	    Entries are matched as case-sensitive glob patterns, so both literal
	    names (".git") and patterns ("*.egg-info") take effect.

	    Args:
	        repo_path: Root directory of the repository to scan.
	        index_path: Path of the index file to compare against.

	    Returns:
	        True if the index is missing, cannot be stat'ed, or is older than at
	        least one ``.py`` file under the root; False otherwise.
	    """
	    from fnmatch import fnmatchcase

	    # A single stat() instead of exists() + stat() avoids a race if the
	    # index is removed between the two calls.
	    try:
	        index_mtime = Path(index_path).stat().st_mtime
	    except OSError:
	        return True

	    def _is_ignored(dirname: str) -> bool:
	        return any(fnmatchcase(dirname, pattern) for pattern in IGNORE_DIRS)

	    repo = Path(repo_path).resolve()

	    for root, dirs, files in os.walk(repo):
	        # Assign in place: os.walk only honours pruning done on this list.
	        dirs[:] = [d for d in dirs if not _is_ignored(d)]
	        for fname in files:
	            if not fname.endswith(".py"):
	                continue
	            fpath = os.path.join(root, fname)
	            try:
	                if os.path.getmtime(fpath) > index_mtime:
	                    return True
	            except OSError:
	                # File vanished or is unreadable; it cannot prove staleness.
	                continue

	    return False