Spaces:

banao-tech
/

interns_manager

Running

App Files Files Community

interns_manager / parser.py

banao-tech

Update parser.py

2527c2a verified 5 days ago

raw

history blame contribute delete

12.8 kB

	"""
	parser.py — ATG EOD report parser
	Claude does two things:
	1. Verifies this is a genuine EOD report (not a random bot mention)
	2. Extracts all structured fields
	Falls back to regex if Claude is unavailable.
	"""

	import os
	import re
	import json
	from dataclasses import dataclass, field


	# Lower-case phrases that signal an intern may be in trouble. _check_risk()
	# looks for each one as a substring of the lower-cased blockers and
	# confidence text, so entries must stay lower-case.
	RISK_FLAG_KEYWORDS = [
	    "stuck",
	    "blocked",
	    "can't proceed",
	    "need help",
	    "behind",
	    "delayed",
	    "not sure",
	    "struggling",
	    "at risk",
	    "won't finish",
	    "unable to",
	]


	@dataclass
	class ParsedReport:
	    """
	    Result of parsing one message sent to the bot.

	    Only raw_text is required; every other field starts empty/falsy and is
	    filled in by parse_report() as extraction and validation proceed.
	    """

	    # --- Input -----------------------------------------------------------
	    raw_text: str

	    # --- Classification --------------------------------------------------
	    # True once the message has been accepted as an actual EOD report.
	    is_eod_report: bool = False

	    # --- Header ----------------------------------------------------------
	    name: str = ""
	    report_date: str = ""

	    # --- Report sections -------------------------------------------------
	    # Content of the WHAT I SOLVED TODAY section.
	    solution: str = ""
	    tasks_in_progress: str = ""
	    blockers: str = ""
	    learned: str = ""

	    # --- AI usage --------------------------------------------------------
	    ai_tool: str = ""
	    # What the intern asked the AI.
	    ai_what_asked: str = ""
	    # What the intern changed from the AI output.
	    ai_changes: str = ""
	    ai_chat_link: str = ""

	    # --- Plans and self-assessment ---------------------------------------
	    plan_tomorrow: str = ""
	    plan_week: str = ""
	    confidence: str = ""

	    # --- Validation and scoring results ----------------------------------
	    format_valid: bool = False
	    missing_fields: list = field(default_factory=list)
	    quality_score: int = 0
	    quality_flags: list = field(default_factory=list)
	    contains_risk_flag: bool = False


	def parse_report(text: str) -> ParsedReport:
	    """
	    Parse an EOD report message into a ParsedReport.

	    Claude is asked first to verify the message is an EOD report and to
	    extract its fields. If that call fails for any reason (including a
	    response that is not a JSON object), the regex extractor is used
	    instead and the message is assumed to be a report.

	    Args:
	        text: Raw message text, possibly containing Slack mention tags.

	    Returns:
	        A ParsedReport. When the message is not an EOD report only
	        raw_text is populated. Otherwise the extracted fields,
	        missing_fields and format_valid are set, and the quality score,
	        flags and risk flag are computed only when format_valid is True.
	    """
	    # Strip bot mention tag before parsing
	    clean_text = re.sub(r'<@[A-Z0-9]+>', '', text).strip()
	    report = ParsedReport(raw_text=clean_text)

	    try:
	        extracted = _extract_with_claude(clean_text)
	        # A JSON list/string/number would break every .get() below, so
	        # treat it as a failed extraction and fall back.
	        if not isinstance(extracted, dict):
	            raise ValueError(
	                f"expected a JSON object, got {type(extracted).__name__}"
	            )
	    except Exception as e:
	        # Deliberate best-effort boundary: any failure degrades to regex.
	        print(f"[parser] Claude failed: {e} — using regex fallback")
	        extracted = _extract_with_regex(clean_text)
	        extracted["is_eod_report"] = True  # regex fallback assumes it is

	    # If Claude says this is not an EOD report — stop here
	    report.is_eod_report = _as_bool(extracted.get("is_eod_report", False))
	    if not report.is_eod_report:
	        print("[parser] Claude determined this is not an EOD report — ignoring")
	        return report

	    # (ParsedReport attribute, key in the extracted dict)
	    field_map = (
	        ("name", "name"),
	        ("report_date", "date"),
	        ("solution", "solution"),
	        ("tasks_in_progress", "tasks_in_progress"),
	        ("blockers", "blockers"),
	        ("learned", "learned"),
	        ("ai_tool", "ai_tool"),
	        ("ai_what_asked", "ai_what_asked"),
	        ("ai_changes", "ai_changes"),
	        ("ai_chat_link", "ai_chat_link"),
	        ("plan_tomorrow", "plan_tomorrow"),
	        ("plan_week", "plan_week"),
	        ("confidence", "confidence"),
	    )
	    for attr, key in field_map:
	        # JSON null or non-string values must not reach .strip()/.lower().
	        setattr(report, attr, _as_text(extracted.get(key, "")))

	    # Validate mandatory fields: each needs at least 3 non-blank characters.
	    required = {
	        "name": report.name,
	        "solution": report.solution,
	        "learned": report.learned,
	        "ai_tool": report.ai_tool,
	        "plan_tomorrow": report.plan_tomorrow,
	        "confidence": report.confidence,
	    }
	    missing = [k for k, v in required.items() if len(v.strip()) < 3]

	    # AI chat link — strictly required, must be a real http(s) URL.
	    if not _is_real_url(report.ai_chat_link):
	        missing.append("ai_chat_link")

	    report.missing_fields = missing
	    report.format_valid = not missing

	    if report.format_valid:
	        report.quality_score = _score(report)
	        report.quality_flags = _flags(report)
	        report.contains_risk_flag = _check_risk(report)

	    return report


	def _as_text(value) -> str:
	    """Coerce an extracted JSON value to a string (None becomes "")."""
	    if value is None:
	        return ""
	    if isinstance(value, str):
	        return value
	    if isinstance(value, (list, tuple)):
	        # Multi-item sections returned as a JSON array: one item per line.
	        return "\n".join(str(item) for item in value)
	    return str(value)


	def _as_bool(value) -> bool:
	    """Interpret a JSON flag, treating strings like "false" as False."""
	    if isinstance(value, str):
	        return value.strip().lower() in ("true", "yes", "1")
	    return bool(value)


	def _is_real_url(value: str) -> bool:
	    """
	    Return True when value is a whitespace-free http(s) URL with a dotted host.

	    Explanations such as "https://cant share" or "http://n/a" fail because
	    they contain whitespace or have no real host name. This replaces a
	    substring blacklist that also rejected genuine links whose path merely
	    contained letters like "na" or "local".
	    """
	    from urllib.parse import urlsplit

	    candidate = value.strip()
	    if not candidate or any(ch.isspace() for ch in candidate):
	        return False
	    try:
	        parts = urlsplit(candidate)
	        host = parts.hostname or ""
	    except ValueError:
	        # urlsplit rejects malformed authorities such as "http://[".
	        return False
	    return parts.scheme.lower() in ("http", "https") and "." in host.strip(".")


	def _extract_with_claude(text: str) -> dict:
	    """
	    Claude does two things in one call:
	    1. Decides if this is a real EOD report
	    2. Extracts all fields if it is

	    Args:
	        text: Message text to classify and extract from. It is embedded
	            verbatim at the end of the prompt.

	    Returns:
	        The JSON object Claude replied with, as a dict.

	    Raises:
	        KeyError: ANTHROPIC_API_KEY is not set in the environment.
	        ValueError: the reply is not valid JSON (json.JSONDecodeError is a
	            ValueError subclass) or is valid JSON but not an object.
	        Exception: anything raised by the anthropic client.
	    """
	    import anthropic
	    client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

	    prompt = f"""You are processing a message sent to an intern management bot.

	First decide: is this a genuine EOD (End of Day) work report from an intern?
	- YES if: it contains work tasks, learning, plans, blockers — even if format is incomplete
	- NO if: it's a question, random message, test, greeting, or anything other than a work report

	If YES, extract all available fields.
	If NO, return {{"is_eod_report": false}} and nothing else.

	Fields to extract (return empty string "" if not present):
	- is_eod_report: true or false
	- name: person's full name (ignore @ Slack tags)
	- date: report date
	- solution: content under WHAT I SOLVED TODAY (outcome, not tasks)
	- tasks_in_progress: tasks in progress
	- blockers: blockers
	- learned: what they learned
	- ai_tool: AI tool(s) used
	- ai_what_asked: what they asked the AI
	- ai_changes: what they changed or rejected from AI output
	- ai_chat_link: shared chat URL — must start with http:// or https://. Return "" if value is N/A, none, not applicable, internal, "cant provide", "not available", "used in vscode", "used in editor", "copilot in vscode", or any explanation instead of a real URL. Only return an actual URL.
	- plan_tomorrow: plan for tomorrow
	- plan_week: plan for the week
	- confidence: selected confidence level — one of: Crushing it, On track, Need help, Stuck

	Return ONLY valid JSON. No prose. No markdown fences.

	Message:
	{text}"""

	    message = client.messages.create(
	        model="claude-sonnet-4-20250514",
	        max_tokens=1000,
	        messages=[{"role": "user", "content": prompt}],
	    )

	    raw = message.content[0].text.strip()
	    # The prompt forbids markdown fences, but strip them if they appear anyway.
	    if raw.startswith("```"):
	        raw = re.sub(r"```(?:json)?", "", raw).replace("```", "").strip()

	    parsed = json.loads(raw)
	    # json.loads also accepts arrays, strings and numbers; callers need a dict.
	    if not isinstance(parsed, dict):
	        raise ValueError(
	            f"Claude returned JSON {type(parsed).__name__}, expected an object"
	        )
	    return parsed


	def _extract_with_regex(text: str) -> dict:
	"""Fallback regex extraction."""
	result = {"is_eod_report": True}

	header = re.search(r"EOD REPORT\s[—\-]+\s(.+?)\s[—\-]+\s(.+)", text, re.IGNORECASE)
	if header:
	result["name"] = re.sub(r'<@[A-Z0-9]+>', '', header.group(1)).strip()
	result["date"] = header.group(2).strip()

	sections = [
	("solution", r"WHAT I SOLVED TODAY", r"TASKS COMPLETED\|TASKS IN PROGRESS\|BLOCKERS"),
	("tasks_in_progress",r"TASKS IN PROGRESS", r"BLOCKERS\|WHAT I LEARNED"),
	("blockers", r"BLOCKERS", r"WHAT I LEARNED"),
	("learned", r"WHAT I LEARNED TODAY", r"AI USAGE\|PLAN FOR"),
	("ai_usage", r"AI USAGE TODAY", r"PLAN FOR TOMORROW"),
	("plan_tomorrow", r"PLAN FOR TOMORROW", r"PLAN FOR THE WEEK\|CONFIDENCE"),
	("plan_week", r"PLAN FOR THE WEEK", r"CONFIDENCE"),
	("confidence_raw", r"CONFIDENCE LEVEL", r"━━━\|$"),
	]

	for key, start_pat, end_pat in sections:
	start = re.search(start_pat, text, re.IGNORECASE)
	if not start:
	result[key] = ""
	continue
	end = re.search(end_pat, text[start.end():], re.IGNORECASE)
	content = text[start.end(): start.end() + end.start()].strip() if end else text[start.end():].strip()
	result[key] = content

	# Parse AI sub-fields
	ai = result.pop("ai_usage", "")
	tool = re.search(r"tool used\s:\s(.+)", ai, re.IGNORECASE)
	asked = re.search(r"what i asked\s:\s(.+)", ai, re.IGNORECASE)
	changed = re.search(r"what i changed\s:\s(.+)", ai, re.IGNORECASE)
	link = re.search(r"chat link\s:\s(.+)", ai, re.IGNORECASE)
	result["ai_tool"] = tool.group(1).strip() if tool else ""
	result["ai_what_asked"] = asked.group(1).strip() if asked else ""
	result["ai_changes"] = changed.group(1).strip() if changed else ""

	# Extract first valid URL from the chat link line
	raw_link_line = link.group(1).strip() if link else ""
	urls = re.findall(r'https?://[^\s]+', raw_link_line)
	raw_link = urls[0] if urls else raw_link_line
	result["ai_chat_link"] = "" if raw_link.lower() in ("n/a", "none", "not applicable", "internal", "-", "") else raw_link

	# Parse confidence
	conf_raw = result.pop("confidence_raw", "")
	marked = re.search(r"\[x\]\s*(.+?)(?:\[\|$)", conf_raw, re.IGNORECASE)
	result["confidence"] = marked.group(1).strip() if marked else conf_raw[:30].strip()

	return result


	def _score(report: ParsedReport) -> int:
	    """
	    Rate a report's quality on a 1–5 scale.

	    Starts from 3, adds one point each for a solution longer than 80
	    characters, a learning note longer than 80 characters and an AI-changes
	    note longer than 50 characters, subtracts one for a blank chat link and
	    two when the AI-changes note reads like the output was used untouched.
	    """
	    # (text, length it must exceed) — each satisfied pair earns one point.
	    detail_checks = (
	        (report.solution, 80),
	        (report.learned, 80),
	        (report.ai_changes, 50),
	    )
	    total = 3 + sum(1 for value, threshold in detail_checks if len(value) > threshold)

	    # Blank chat link costs a point.
	    if not report.ai_chat_link.strip():
	        total -= 1

	    # Phrases suggesting the AI output was pasted without review.
	    changes = report.ai_changes
	    if changes:
	        lowered = changes.lower()
	        for marker in ("nothing", "used as is", "no changes", "accepted all", "kept everything"):
	            if marker in lowered:
	                total -= 2
	                break

	    # Clamp into the 1..5 range.
	    return min(max(total, 1), 5)


	def _flags(report: ParsedReport) -> list:
	    """
	    Collect quality flags for a report.

	    Returns:
	        A list of flag names, in this order when present:
	        no_solution_stated, no_ai_chat_link, possible_copy_paste,
	        shallow_learning, no_weekly_plan, needs_help, stuck.
	    """
	    flags = []

	    if not report.solution or len(report.solution.strip()) < 20:
	        flags.append("no_solution_stated")

	    if not report.ai_chat_link.strip():
	        flags.append("no_ai_chat_link")

	    # Same phrase list _score() penalises, including "kept everything", so a
	    # report cannot lose points for copy-paste without also being flagged.
	    copy_paste_markers = (
	        "nothing", "used as is", "no changes", "accepted all", "kept everything",
	    )
	    if report.ai_changes:
	        lower = report.ai_changes.lower()
	        if any(marker in lower for marker in copy_paste_markers):
	            flags.append("possible_copy_paste")

	    # Strip first, like the other length checks, so padding cannot mask a
	    # short answer.
	    if len(report.learned.strip()) < 30:
	        flags.append("shallow_learning")

	    if not report.plan_week or len(report.plan_week.strip()) < 10:
	        flags.append("no_weekly_plan")

	    conf = report.confidence.lower()
	    if "need help" in conf:
	        flags.append("needs_help")
	    if "stuck" in conf:
	        flags.append("stuck")

	    return flags


	def _check_risk(report: ParsedReport) -> bool:
	    """Return True if the blockers or confidence text contains any risk keyword."""
	    # Search both fields at once, case-insensitively.
	    haystack = (report.blockers + " " + report.confidence).lower()
	    for keyword in RISK_FLAG_KEYWORDS:
	        if keyword in haystack:
	            return True
	    return False


	def format_missing_fields_message(missing_fields: list, report_type: str = "atg") -> str:
	    """
	    Build the reply sent when a report is missing required fields.

	    Args:
	        missing_fields: Field keys that failed validation. Known keys are
	            shown with a human-readable label; unknown keys are shown as-is.
	        report_type: Accepted but not used by this function.

	    Returns:
	        A message listing the missing fields as bullets, followed by the
	        full report template in a code fence and a resubmission reminder.
	    """
	    labels = {
	        "name": "Your name in the header",
	        "solution": "WHAT I SOLVED TODAY — outcome not tasks",
	        "learned": "WHAT I LEARNED TODAY",
	        "ai_tool": "AI USAGE TODAY → Tool used",
	        "ai_chat_link": "AI USAGE TODAY → Chat link — must be a real https:// URL. If you used Copilot in VS Code, export or screenshot the conversation and share via a link.",
	        "plan_tomorrow": "PLAN FOR TOMORROW",
	        "confidence": "CONFIDENCE LEVEL",
	    }
	    bullets = "\n".join(f"• {labels.get(key, key)}" for key in missing_fields)

	    # Horizontal rule used three times in the template.
	    rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

	    # One entry per template line; "" entries produce the blank separator lines.
	    template_lines = [
	        "```",
	        "@intern-management-agent",
	        rule,
	        "EOD REPORT — Your Name — DD Mon YYYY",
	        rule,
	        "",
	        "WHAT I SOLVED TODAY",
	        "- What problem did you actually solve? (outcome, not tasks)",
	        "",
	        "TASKS IN PROGRESS",
	        "- What you started — expected completion: [date]",
	        "",
	        "BLOCKERS",
	        "- What is stopping you and who you need / None",
	        "",
	        "WHAT I LEARNED TODAY",
	        "- Specific concept — how will you apply it tomorrow?",
	        "",
	        "AI USAGE TODAY",
	        "Tool used: Claude / ChatGPT / Copilot",
	        "What I asked it: [specific task]",
	        "What I changed from its output: [what you modified and why]",
	        "Chat link: [mandatory — paste shared URL]",
	        "",
	        "PLAN FOR TOMORROW",
	        "- Task with expected output",
	        "",
	        "PLAN FOR THE WEEK",
	        "- What you aim to complete by end of week",
	        "",
	        "CONFIDENCE LEVEL",
	        "> [x] Crushing it [ ] On track [ ] Need help [ ] Stuck",
	        rule,
	        "```",
	    ]
	    template = "\n".join(template_lines)

	    return (
	        "Your report is missing required fields:\n\n"
	        + bullets
	        + "\n\nUse this format:\n\n"
	        + template
	        + "\n\nTag @intern-management-agent when you resubmit."
	    )