# interns_manager / parser.py
# banao-tech's picture
# Update parser.py
# 2527c2a verified
"""
parser.py — ATG EOD report parser
Claude does two things:
1. Verifies this is a genuine EOD report (not a random bot mention)
2. Extracts all structured fields
Falls back to regex if Claude is unavailable.
"""
import os
import re
import json
from dataclasses import dataclass, field
# Phrases that mark a report as "at risk". They are looked up as plain
# substrings in the lower-cased blockers + confidence text, so keep every
# entry lower-case.
RISK_FLAG_KEYWORDS = [
    "stuck",
    "blocked",
    "can't proceed",
    "need help",
    "behind",
    "delayed",
    "not sure",
    "struggling",
    "at risk",
    "won't finish",
    "unable to",
]
@dataclass
class ParsedReport:
    """Structured view of one message sent to the bot.

    Only ``raw_text`` is mandatory. Every other attribute starts out empty /
    falsy and is filled in by ``parse_report`` as extraction, validation and
    scoring proceed.
    """

    # --- input -----------------------------------------------------------
    raw_text: str
    # True once the message has been accepted as an actual EOD report.
    is_eod_report: bool = False

    # --- header ----------------------------------------------------------
    name: str = ""
    report_date: str = ""

    # --- report sections -------------------------------------------------
    # Content of the "WHAT I SOLVED TODAY" section.
    solution: str = ""
    tasks_in_progress: str = ""
    blockers: str = ""
    learned: str = ""

    # --- AI usage --------------------------------------------------------
    ai_tool: str = ""
    # What the intern asked the AI tool.
    ai_what_asked: str = ""
    # What the intern changed in (or rejected from) the AI output.
    ai_changes: str = ""
    ai_chat_link: str = ""

    # --- planning --------------------------------------------------------
    plan_tomorrow: str = ""
    plan_week: str = ""
    confidence: str = ""

    # --- validation and scoring -------------------------------------------
    # True when no mandatory field is missing.
    format_valid: bool = False
    # Keys of the mandatory fields that were absent or too short.
    missing_fields: list = field(default_factory=list)
    quality_score: int = 0
    quality_flags: list = field(default_factory=list)
    contains_risk_flag: bool = False
def parse_report(text: str) -> ParsedReport:
    """Parse a message sent to the bot into a ``ParsedReport``.

    Claude is asked first to verify that the message is an EOD report and to
    extract its fields. If that call fails for any reason (or does not yield
    a JSON object) the regex extractor is used instead, and the message is
    then assumed to be a report.

    Args:
        text: Raw message text, possibly containing Slack mention tags such
            as ``<@U123ABC>``.

    Returns:
        A ``ParsedReport``. When the message is judged not to be an EOD
        report only ``raw_text`` and ``is_eod_report`` are populated.
        Otherwise all extracted fields, ``missing_fields`` and
        ``format_valid`` are set, and the quality score, quality flags and
        risk flag are computed when the format is valid.
    """
    # Strip bot mention tags before parsing.
    clean_text = re.sub(r"<@[A-Z0-9]+>", "", text).strip()
    report = ParsedReport(raw_text=clean_text)

    try:
        extracted = _extract_with_claude(clean_text)
        # Valid JSON that is not an object (list, string, ...) is as useless
        # as a failed call: route it to the fallback instead of crashing on
        # ``.get`` further down.
        if not isinstance(extracted, dict):
            raise TypeError(
                f"expected a JSON object, got {type(extracted).__name__}"
            )
    except Exception as e:  # deliberate best-effort boundary
        print(f"[parser] Claude failed: {e} — using regex fallback")
        extracted = _extract_with_regex(clean_text)
        extracted["is_eod_report"] = True  # regex fallback assumes it is

    # A model may answer with the *string* "false", which is truthy.
    verdict = extracted.get("is_eod_report", False)
    if isinstance(verdict, str):
        verdict = verdict.strip().lower() in ("true", "yes", "1")
    report.is_eod_report = bool(verdict)
    if not report.is_eod_report:
        print("[parser] Claude determined this is not an EOD report — ignoring")
        return report

    # ParsedReport attribute -> key in the extracted dict.
    attr_to_key = {
        "name": "name",
        "report_date": "date",
        "solution": "solution",
        "tasks_in_progress": "tasks_in_progress",
        "blockers": "blockers",
        "learned": "learned",
        "ai_tool": "ai_tool",
        "ai_what_asked": "ai_what_asked",
        "ai_changes": "ai_changes",
        "ai_chat_link": "ai_chat_link",
        "plan_tomorrow": "plan_tomorrow",
        "plan_week": "plan_week",
        "confidence": "confidence",
    }
    for attr, key in attr_to_key.items():
        # Normalise to str so a JSON null / list / number cannot break the
        # string handling in validation and scoring.
        setattr(report, attr, _as_text(extracted.get(key)))

    # Validate mandatory fields: present and at least 3 visible characters.
    required = ("name", "solution", "learned", "ai_tool", "plan_tomorrow", "confidence")
    missing = [
        attr for attr in required if len(getattr(report, attr).strip()) < 3
    ]

    # AI chat link is strictly required and must be a real http(s) URL.
    if not _is_shared_chat_url(report.ai_chat_link):
        missing.append("ai_chat_link")

    report.missing_fields = missing
    report.format_valid = not missing

    if report.format_valid:
        report.quality_score = _score(report)
        report.quality_flags = _flags(report)
        report.contains_risk_flag = _check_risk(report)
    return report


def _as_text(value) -> str:
    """Coerce an extracted field value to ``str`` (``None`` becomes ``""``)."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return "\n".join(_as_text(item) for item in value)
    return str(value)


def _is_shared_chat_url(value: str) -> bool:
    """Return True when *value* starts with a plausible http(s) chat URL.

    The first whitespace-delimited token must parse as an ``http``/``https``
    URL whose host contains a dot (so ``https://n/a`` or ``http://localhost``
    do not count). Excuse phrases are only looked for, as whole words, in any
    commentary that follows the URL. Searching for them inside the URL itself
    rejected valid links that merely contained e.g. "na" or "local".
    """
    from urllib.parse import urlparse

    parts = value.strip().split(None, 1)
    if not parts:
        return False
    commentary = parts[1].lower() if len(parts) > 1 else ""

    try:
        parsed = urlparse(parts[0])
        host = parsed.hostname or ""
    except ValueError:  # e.g. malformed bracketed host
        return False
    if parsed.scheme not in ("http", "https") or "." not in host:
        return False

    not_a_link = (
        "cant", "can't", "cannot", "not available", "n/a", "na",
        "not applicable", "internal", "vscode", "vs code", "editor",
        "inline", "no link", "not shared", "private", "local",
    )
    return not any(
        re.search(rf"(?<![a-z0-9]){re.escape(phrase)}(?![a-z0-9])", commentary)
        for phrase in not_a_link
    )
def _extract_with_claude(text: str) -> dict:
    """Ask Claude to verify and extract an EOD report in a single call.

    The model decides whether *text* is a real EOD report and, if so,
    returns every field it can find as a JSON object.

    Args:
        text: Message text with bot mention tags already removed.

    Returns:
        The decoded JSON object. It contains at least ``is_eod_report`` when
        the model follows the prompt.

    Raises:
        KeyError: ``ANTHROPIC_API_KEY`` is not set.
        ValueError: The reply is not valid JSON (``json.JSONDecodeError``)
            or is valid JSON but not an object.
        Exception: Anything raised by the ``anthropic`` client.
    """
    import anthropic

    client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
    prompt = f"""You are processing a message sent to an intern management bot.
First decide: is this a genuine EOD (End of Day) work report from an intern?
- YES if: it contains work tasks, learning, plans, blockers — even if format is incomplete
- NO if: it's a question, random message, test, greeting, or anything other than a work report
If YES, extract all available fields.
If NO, return {{"is_eod_report": false}} and nothing else.
Fields to extract (return empty string "" if not present):
- is_eod_report: true or false
- name: person's full name (ignore @ Slack tags)
- date: report date
- solution: content under WHAT I SOLVED TODAY (outcome, not tasks)
- tasks_in_progress: tasks in progress
- blockers: blockers
- learned: what they learned
- ai_tool: AI tool(s) used
- ai_what_asked: what they asked the AI
- ai_changes: what they changed or rejected from AI output
- ai_chat_link: shared chat URL — must start with http:// or https://. Return "" if value is N/A, none, not applicable, internal, "cant provide", "not available", "used in vscode", "used in editor", "copilot in vscode", or any explanation instead of a real URL. Only return an actual URL.
- plan_tomorrow: plan for tomorrow
- plan_week: plan for the week
- confidence: selected confidence level — one of: Crushing it, On track, Need help, Stuck
Return ONLY valid JSON. No prose. No markdown fences.
Message:
{text}"""
    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1000,
        messages=[{"role": "user", "content": prompt}],
    )
    raw = message.content[0].text.strip()

    if raw.startswith("```"):
        # Remove only the wrapping fence. Stripping every "```" would corrupt
        # field values that themselves quote a code block.
        raw = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", raw).strip()

    parsed = json.loads(raw)
    if not isinstance(parsed, dict):
        # Callers index the result by key; a list or scalar is a bad reply.
        raise ValueError(
            f"Claude returned JSON {type(parsed).__name__}, expected an object"
        )
    return parsed
def _extract_with_regex(text: str) -> dict:
    """Extract report fields with regular expressions (fallback path).

    Sections are located by their headings. A heading at the start of a line
    (optionally preceded by bullets, quote markers or emphasis characters) is
    preferred over the same words appearing inside a sentence, so that e.g.
    "resolved the blockers" in one section does not cut that section short.
    If no line-initial heading exists, the first occurrence anywhere is used.

    Args:
        text: Message text with bot mention tags already removed.

    Returns:
        A dict with ``is_eod_report`` (always True), the section keys
        ``solution``, ``tasks_in_progress``, ``blockers``, ``learned``,
        ``plan_tomorrow``, ``plan_week``, the AI sub-fields ``ai_tool``,
        ``ai_what_asked``, ``ai_changes``, ``ai_chat_link``, and
        ``confidence``. ``name`` and ``date`` are present only when the
        "EOD REPORT — name — date" header line is found. Missing sections
        are empty strings.
    """
    result = {"is_eod_report": True}

    # Header separators may be em dashes, en dashes or plain hyphens.
    header = re.search(
        r"EOD REPORT\s*[—–\-]+\s*(.+?)\s*[—–\-]+\s*(.+)", text, re.IGNORECASE
    )
    if header:
        result["name"] = re.sub(r"<@[A-Z0-9]+>", "", header.group(1)).strip()
        result["date"] = header.group(2).strip()

    # (result key, heading that opens the section, heading(s) that close it)
    sections = [
        ("solution", r"WHAT I SOLVED TODAY", r"TASKS COMPLETED|TASKS IN PROGRESS|BLOCKERS"),
        ("tasks_in_progress", r"TASKS IN PROGRESS", r"BLOCKERS|WHAT I LEARNED"),
        ("blockers", r"BLOCKERS", r"WHAT I LEARNED"),
        ("learned", r"WHAT I LEARNED TODAY", r"AI USAGE|PLAN FOR"),
        ("ai_usage", r"AI USAGE TODAY", r"PLAN FOR TOMORROW"),
        ("plan_tomorrow", r"PLAN FOR TOMORROW", r"PLAN FOR THE WEEK|CONFIDENCE"),
        ("plan_week", r"PLAN FOR THE WEEK", r"CONFIDENCE"),
        # Closed by the horizontal rule, otherwise runs to the end of text.
        ("confidence_raw", r"CONFIDENCE LEVEL", r"━━━"),
    ]
    for key, start_pat, end_pat in sections:
        start = _find_heading(start_pat, text)
        if not start:
            result[key] = ""
            continue
        end = _find_heading(end_pat, text, start.end())
        stop = end.start() if end else len(text)
        result[key] = text[start.end():stop].strip()

    # Parse AI sub-fields. Labels tolerate extra words before the colon, so
    # both "What I asked:" and the template's "What I asked it:" match.
    ai = result.pop("ai_usage", "")

    def labelled(label_pattern: str) -> str:
        hit = re.search(rf"{label_pattern}\s*:\s*(.+)", ai, re.IGNORECASE)
        return hit.group(1).strip() if hit else ""

    result["ai_tool"] = labelled(r"tool used")
    result["ai_what_asked"] = labelled(r"what i asked[^:\n]*")
    result["ai_changes"] = labelled(r"what i changed[^:\n]*")

    # Extract the first URL from the chat link line; otherwise keep the raw
    # value unless it is an obvious "no link" placeholder.
    raw_link_line = labelled(r"chat link")
    urls = re.findall(r"https?://[^\s]+", raw_link_line)
    raw_link = urls[0] if urls else raw_link_line
    placeholders = ("n/a", "none", "not applicable", "internal", "-", "")
    result["ai_chat_link"] = "" if raw_link.lower() in placeholders else raw_link

    # Parse confidence: the label after the ticked box, up to the next box or
    # the end of that line (MULTILINE so one-box-per-line layouts work).
    conf_raw = result.pop("confidence_raw", "")
    marked = re.search(
        r"\[x\]\s*(.+?)(?:\[|$)", conf_raw, re.IGNORECASE | re.MULTILINE
    )
    result["confidence"] = marked.group(1).strip() if marked else conf_raw[:30].strip()
    return result


def _find_heading(pattern: str, text: str, pos: int = 0):
    """Find a section heading in *text* at or after *pos*.

    Returns the match for the first line-initial heading if there is one,
    otherwise the first match anywhere, otherwise ``None``.
    """
    line_initial = re.compile(
        rf"^[^\w\n]*(?:{pattern})", re.IGNORECASE | re.MULTILINE
    )
    # Passing ``pos`` (rather than slicing) keeps ``^`` tied to real line
    # starts and makes the match offsets absolute.
    return line_initial.search(text, pos) or re.compile(
        pattern, re.IGNORECASE
    ).search(text, pos)
def _score(report: "ParsedReport") -> int:
    """Compute a 1-5 quality score for a report.

    Starts at 3, adds one point each for a detailed solution (> 80 chars),
    detailed learning (> 80 chars) and a substantive AI critique (> 50
    chars), subtracts one for a missing chat link and two when the AI
    critique reads like the output was used unchanged. Lengths are measured
    on stripped text, and ``None`` fields count as empty.

    Args:
        report: The parsed report to score.

    Returns:
        The score clamped to the range 1..5.
    """
    solution = (report.solution or "").strip()
    learned = (report.learned or "").strip()
    ai_changes = (report.ai_changes or "").strip()
    link = (report.ai_chat_link or "").strip()

    score = 3
    # Reward solution section
    if len(solution) > 80:
        score += 1
    # Reward detailed learning
    if len(learned) > 80:
        score += 1
    # Reward AI critique
    if len(ai_changes) > 50:
        score += 1
    # Penalise missing chat link
    if not link:
        score -= 1
    # Penalise copy-paste signals
    copy_paste_signals = (
        "nothing", "used as is", "no changes", "accepted all", "kept everything",
    )
    lowered = ai_changes.lower()
    if any(signal in lowered for signal in copy_paste_signals):
        score -= 2
    return max(1, min(5, score))
def _flags(report: "ParsedReport") -> list:
    """Collect quality flags for a report.

    ``None`` fields are treated as empty and all lengths are measured on
    stripped text. The copy-paste signals are the same phrases that
    ``_score`` penalises.

    Args:
        report: The parsed report to inspect.

    Returns:
        A list containing, in this order, any of: ``no_solution_stated``,
        ``no_ai_chat_link``, ``possible_copy_paste``, ``shallow_learning``,
        ``no_weekly_plan``, ``needs_help``, ``stuck``.
    """
    solution = (report.solution or "").strip()
    link = (report.ai_chat_link or "").strip()
    ai_changes = (report.ai_changes or "").lower()
    learned = (report.learned or "").strip()
    plan_week = (report.plan_week or "").strip()
    confidence = (report.confidence or "").lower()

    copy_paste_signals = (
        "nothing", "used as is", "no changes", "accepted all", "kept everything",
    )

    # (flag name, condition) pairs, evaluated in reporting order.
    checks = (
        ("no_solution_stated", len(solution) < 20),
        ("no_ai_chat_link", not link),
        ("possible_copy_paste", any(s in ai_changes for s in copy_paste_signals)),
        ("shallow_learning", len(learned) < 30),
        ("no_weekly_plan", len(plan_week) < 10),
        ("needs_help", "need help" in confidence),
        ("stuck", "stuck" in confidence),
    )
    return [flag for flag, raised in checks if raised]
def _check_risk(report: "ParsedReport") -> bool:
    """Report whether the blockers or confidence text contains a risk phrase.

    The two fields are joined, lower-cased and searched for every entry of
    ``RISK_FLAG_KEYWORDS`` as a plain substring. Fields that are not strings
    (for example ``None``) are skipped rather than raising ``TypeError``
    in ``str.join``.

    Args:
        report: The parsed report to inspect.

    Returns:
        True if any risk keyword occurs in the combined text.
    """
    parts = [
        part for part in (report.blockers, report.confidence)
        if isinstance(part, str)
    ]
    haystack = " ".join(parts).lower()
    return any(keyword in haystack for keyword in RISK_FLAG_KEYWORDS)
def format_missing_fields_message(missing_fields: list, report_type: str = "atg") -> str:
    """Build the reply sent to an intern whose report lacks required fields.

    Args:
        missing_fields: Field keys that failed validation. Known keys are
            shown with a human-readable label; unknown keys are shown as-is.
        report_type: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A message listing the missing fields as bullets, followed by the full
        report template inside a code fence and a resubmission reminder.
    """
    field_labels = {
        "name": "Your name in the header",
        "solution": "WHAT I SOLVED TODAY — outcome not tasks",
        "learned": "WHAT I LEARNED TODAY",
        "ai_tool": "AI USAGE TODAY → Tool used",
        "ai_chat_link": "AI USAGE TODAY → Chat link — must be a real https:// URL. If you used Copilot in VS Code, export or screenshot the conversation and share via a link.",
        "plan_tomorrow": "PLAN FOR TOMORROW",
        "confidence": "CONFIDENCE LEVEL",
    }
    fields_str = "\n".join(
        f"• {field_labels.get(key, key)}" for key in missing_fields
    )
    return (
        "Your report is missing required fields:\n\n"
        f"{fields_str}\n\n"
        "Use this format:\n\n"
        "```\n"
        "@intern-management-agent\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "EOD REPORT — Your Name — DD Mon YYYY\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        "WHAT I SOLVED TODAY\n"
        "- What problem did you actually solve? (outcome, not tasks)\n\n"
        "TASKS IN PROGRESS\n"
        "- What you started — expected completion: [date]\n\n"
        "BLOCKERS\n"
        "- What is stopping you and who you need / None\n\n"
        "WHAT I LEARNED TODAY\n"
        "- Specific concept — how will you apply it tomorrow?\n\n"
        "AI USAGE TODAY\n"
        "Tool used: Claude / ChatGPT / Copilot\n"
        "What I asked it: [specific task]\n"
        "What I changed from its output: [what you modified and why]\n"
        "Chat link: [mandatory — paste shared URL]\n\n"
        "PLAN FOR TOMORROW\n"
        "- Task with expected output\n\n"
        "PLAN FOR THE WEEK\n"
        "- What you aim to complete by end of week\n\n"
        "CONFIDENCE LEVEL\n"
        "> [x] Crushing it [ ] On track [ ] Need help [ ] Stuck\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "```\n\n"
        "Tag @intern-management-agent when you resubmit."
    )