Spaces:
Running
Running
| import os | |
| import json | |
| import logging | |
| import re | |
| import random | |
| from huggingface_hub import InferenceClient | |
| logger = logging.getLogger(__name__) | |
class Planner:
    """
    Agent 1: Semantic Planner.

    Sends the source text to a hosted chat model, asks for a JSON outline,
    and rejects outlines whose word bigrams overlap the source too heavily
    (the "no extractive copying" check).
    """

    # Number of generate / parse / check rounds before the fallback is used.
    MAX_ATTEMPTS = 3
    # Bigram Jaccard score above which an outline counts as "too much copying".
    MAX_OVERLAP = 0.3

    def __init__(self, hf_token=None):
        """
        Create the inference client.

        Args:
            hf_token: API token. When falsy, the ``HF_TOKEN`` environment
                variable is used instead (empty string if it is unset).
        """
        self.token = hf_token or os.getenv("HF_TOKEN", "")
        self.client = InferenceClient(token=self.token)
        # Using Qwen2.5-7B-Instruct for strong JSON adherence
        self.model = "Qwen/Qwen2.5-7B-Instruct"

    def plan(self, text):
        """
        Build an outline for ``text``, retrying up to ``MAX_ATTEMPTS`` times.

        Each attempt generates an outline, parses it, and measures its bigram
        overlap with the source. An attempt is discarded when parsing yields
        nothing, when the overlap exceeds ``MAX_OVERLAP``, or when any step
        raises.

        Args:
            text: Source text to outline.

        Returns:
            The parsed outline dict from the first accepted attempt, or a
            fixed fallback dict when every attempt is discarded.
        """
        logger.info("planner: extracting outline from %d chars", len(text))
        for attempt in range(self.MAX_ATTEMPTS):
            try:
                # 1. Generate Outline
                outline_json = self._generate_outline(text, attempt)
                # 2. Parse JSON (None or an empty object both trigger a retry)
                plan = self._parse_json(outline_json)
                if not plan:
                    continue
                # 3. Check Overlap (Enforce 'No Copying')
                overlap = self._check_overlap(text, plan)
                if overlap > self.MAX_OVERLAP:
                    logger.warning("planner: overlap %.2f too high, retrying...", overlap)
                    continue
                logger.info("planner: success (overlap=%.2f)", overlap)
                return plan
            except Exception as exc:
                # Deliberate best-effort boundary: a failed request or a
                # malformed outline must not abort the remaining attempts.
                logger.error("planner attempt %d failed: %s", attempt, exc)
        logger.warning("planner: no usable outline after %d attempts, using fallback", self.MAX_ATTEMPTS)
        # Fallback: Return simple structure if all else fails.
        # NOTE(review): this fallback uses "claim"/"support" keys, while the
        # prompt in _generate_outline requests "fact"/"supporting_data".
        return {
            "topic": "General",
            "audience": "General",
            "tone": "Casual",
            "points": [{"claim": "Could not extract plan.", "support": [], "example_hint": ""}],
            "point_order_flex": True
        }

    def _generate_outline(self, text, attempt):
        """
        Ask the chat model for a JSON outline of ``text``.

        Args:
            text: Source text, embedded verbatim in the user message.
            attempt: Zero-based attempt index; any value above 0 appends a
                stricter instruction to the user message.

        Returns:
            The content of the first choice's message in the response.
        """
        # NOTE(review): the prompt pins a hard-coded date; confirm whether it
        # should be computed at call time instead.
        system_content = (
            "TODAY'S DATE: February 11, 2026.\n"
            "You are a technical data extraction specialist. Your goal is to extract ONLY hard facts from the source text.\n"
            "STRICT FIDELITY: You must mirror the source exactly. Assume the text is current as of 2026.\n"
            "1. THEMES: List 3 specific themes found ONLY in the provided text.\n"
            "2. DATA (MIRROR RULE): Extract specific statistics and names with EXACT WORDING. If the source says '42 percent', do not write '42%'.\n"
            "3. CHRONOLOGY: Entities must appear in a logical or chronological sequence.\n\n"
            "JSON Format:\n"
            "{\n"
            " \"topic\": \"...\",\n"
            " \"themes\": [\"...\", \"...\", \"...\"],\n"
            " \"entities\": [\"...\", \"...\"],\n"
            " \"points\": [\n"
            " {\n"
            " \"fact\": \"... (Use EXACT statistics and names from source)\",\n"
            " \"supporting_data\": [\"...\"],\n"
            " \"sequence_order\": 1\n"
            " }\n"
            " ]\n"
            "}\n"
        )
        user_content = f"Extract hard facts from this text:\n\n{text}"
        if attempt > 0:
            user_content += "\n\nCRITICAL: BE MORE FACTUAL. REMOVE ALL ADJECTIVES. USE ONLY DATA."
        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content}
        ]
        response = self.client.chat_completion(
            messages=messages,
            model=self.model,
            max_tokens=1024,
            # NOTE(review): 0.7 is a fairly high sampling temperature for an
            # extraction task; confirm this value is intended.
            temperature=0.7,
            top_p=0.9
        )
        return response.choices[0].message.content

    def _parse_json(self, raw_text):
        """
        Extract a JSON object from the model's reply.

        Markdown code fences are stripped first. If the remaining text is not
        valid JSON on its own (prose around the object, or a language tag
        left on the fence line), the outermost ``{...}`` span is tried.

        Args:
            raw_text: Reply text from the model.

        Returns:
            The decoded dict, or ``None`` when the reply is not a string,
            contains no decodable JSON, or decodes to something other than
            an object.
        """
        if not isinstance(raw_text, str):
            logger.error("planner: failed to parse JSON output")
            return None

        # Cleanup markdown
        cleaned = raw_text.strip()
        if "```json" in cleaned:
            cleaned = cleaned.split("```json", 1)[1].split("```", 1)[0]
        elif "```" in cleaned:
            cleaned = cleaned.split("```", 2)[1]

        candidates = [cleaned]
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if 0 <= start < end:
            # Second chance, used only when the text as a whole does not decode.
            candidates.append(cleaned[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed
            # Decoded, but not an object: callers rely on dict access.
            break

        logger.error("planner: failed to parse JSON output")
        return None

    @staticmethod
    def _bigrams(s):
        """Return the set of adjacent lower-cased word pairs in ``s``."""
        words = re.findall(r'\w+', s.lower())
        return set(zip(words, words[1:]))

    def _check_overlap(self, original, plan):
        """
        Calculates Jaccard overlap of bigrams between Original Text and Outline Values.

        The outline text is the topic followed by each point's statement and
        supporting items. Both key sets are read: ``claim``/``support`` and
        ``fact``/``supporting_data`` (the keys requested by the prompt in
        ``_generate_outline``). Non-string values are ignored.

        Args:
            original: Source text.
            plan: Outline dict.

        Returns:
            ``|A & B| / |A | B|`` over the two bigram sets, or ``0.0`` when
            either set is empty.
        """
        fragments = []
        topic = plan.get("topic", "")
        if isinstance(topic, str):
            fragments.append(topic)

        points = plan.get("points", [])
        if not isinstance(points, list):
            points = []
        for point in points:
            if not isinstance(point, dict):
                continue
            for key in ("claim", "fact"):
                statement = point.get(key)
                if isinstance(statement, str):
                    fragments.append(statement)
            for key in ("support", "supporting_data"):
                items = point.get(key)
                if isinstance(items, str):
                    # A bare string is treated as a single supporting item.
                    items = [items]
                if isinstance(items, (list, tuple)):
                    fragments.extend(item for item in items if isinstance(item, str))

        orig_bigrams = self._bigrams(original)
        plan_bigrams = self._bigrams(" ".join(fragments))
        if not orig_bigrams or not plan_bigrams:
            return 0.0
        return len(orig_bigrams & plan_bigrams) / len(orig_bigrams | plan_bigrams)