File size: 5,080 Bytes
d1abcca
 
7e28f04
d1abcca
c872b36
7e28f04
 
d1abcca
 
 
7e28f04
 
 
d1abcca
17fb2e6
d1abcca
8e8db39
 
 
 
 
d1abcca
7e28f04
 
 
d1abcca
8e8db39
d1abcca
7e28f04
 
 
 
 
d1abcca
7e28f04
8e8db39
 
7e28f04
 
d1abcca
8e8db39
 
 
d1abcca
8e8db39
 
 
d1abcca
8e8db39
 
 
 
 
 
 
 
 
7e28f04
 
8e8db39
c872b36
 
7e28f04
 
d1abcca
7e28f04
 
 
d1abcca
8e8db39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c872b36
 
 
04d5068
c872b36
04d5068
 
 
 
c872b36
 
 
04d5068
 
 
 
 
 
 
 
c872b36
04d5068
c872b36
7e28f04
 
 
 
 
 
 
 
 
 
 
 
d1abcca
 
7e28f04
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

import os
import json
import logging
import re
from agents.planner import Planner
from agents.writer import Writer
from agents.humanizer import Humanizer
from agents.verifier import Verifier

# Configure logging
# Root-level basicConfig: timestamped INFO-level output; every log call in
# this module goes through the module logger below.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def process_text(text, hf_token=None, intensity=1.0):
    """
    Main orchestration function for the Precision Humanization Pipeline.

    Stages:
    1. Planner: Extracts exact statistics and themes.
    2. Writer: Rewrites using Source Anchor + Staccato Rhythm.
    3. Humanizer: Refines with strict Fidelity Pillars.
    4. FidelityGuard: Flags any output that mutates or misses source facts.

    Args:
        text: AI-generated input text to humanize.
        hf_token: Hugging Face API token; falls back to the HF_TOKEN env var.
        intensity: Rewrite strength forwarded to Writer and Humanizer.

    Returns:
        Tuple of (plan_json, blind_draft, humanized_text, verification_str).
        On any failure the first element is an "Error: ..." message and the
        remaining three are empty strings.
    """
    token = hf_token or os.getenv("HF_TOKEN")
    if not token:
        return "Error: HF_TOKEN not found.", "", "", ""

    logger.info("Starting Precision Humanization Pipeline (Phase 15)...")

    # Initialize Agents
    planner = Planner(token)
    writer = Writer(token)
    humanizer = Humanizer(token)
    verifier = Verifier(token)

    try:
        # Step 1: Semantic Planning — produces the entities/points the
        # fidelity guard checks against later.
        logger.info("Step 1: Planning (Exact Mirroring)...")
        plan = planner.plan(text)
        plan_str = json.dumps(plan, indent=2)

        # Step 2: Source-Guided Writing
        logger.info("Step 2: Guided Writing (Source Anchor)...")
        draft = writer.write(plan, text, intensity=intensity)

        # Step 3: Humanization (Grounded Mode)
        # (fixed log-message typo: "Pilar" -> "Pillar", matching the docstring)
        logger.info("Step 3: Humanizing (Fidelity Pillar)...")
        humanized_text = humanizer.humanize(draft, text, intensity=intensity)

        # Step 4: Fidelity Guard (Hard Fact check) — warn-only by design.
        logger.info("Step 4: Fidelity Guard (Fact Lock)...")
        if not fidelity_guard(humanized_text, plan):
            logger.warning("Fidelity check failed! Draft likely mutated facts.")
            # We don't fail the whole app, but we log the warning for the developer.
            # In production, we might retry or return a 'Fidelity Error'.

        # Step 5: Verification
        logger.info("Step 5: Verifying...")
        verification = verifier.verify(humanized_text)

        # Step 6: Final Polish (cosmetic regex cleanup of the returned text)
        humanized_text = final_polish(humanized_text)

        ver_str = f"Label: {verification['label']}\nConfidence: {verification['confidence']:.1%}"
        return plan_str, draft, humanized_text, ver_str

    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception("Pipeline failed: %s", e)
        return f"Error: {str(e)}", "", "", ""

def fidelity_guard(text, plan):
    """
    Verify that the plan's core facts survive in the rewritten text.

    Every planned entity, and every numeric stat ("42 percent" / "42%")
    extracted from the plan's points, must appear case-insensitively in
    *text*. Returns True when all facts are present; False (with an error
    log listing each miss) when any are missing or mutated.
    """
    haystack = text.lower()
    problems = []

    # Each named entity from the plan must still be mentioned somewhere.
    for entity in plan.get("entities", []):
        if entity.lower() not in haystack:
            problems.append(f"Missing Entity: {entity}")

    # Numeric stats pulled out of each point's fact must appear verbatim.
    for point in plan.get("points", []):
        for stat in re.findall(r'\d+\spercent|\d+%', point.get("fact", "")):
            if stat.lower() not in haystack:
                problems.append(f"Missing/Mutated Stat: {stat}")

    if problems:
        logger.error("FIDELITY FAIL: %s", ", ".join(problems))
        return False
    return True

def final_polish(text):
    """
    Final regex-based cleanup to fix tokenization hallucinations
    and ensure structural integrity/spacing.

    Args:
        text: Humanized text to clean.

    Returns:
        Cleaned text with em-dashes replaced, common contraction glitches
        repaired, whitespace normalized, and outer whitespace stripped.
    """
    # Rule 2: NO EM-DASHES (Replace with comma)
    text = text.replace("—", ",").replace("--", ",")

    # Fix common tokenization errors from high temperature.
    # Word boundaries (\b) keep these patterns from firing inside longer
    # words and mangling them (the unanchored originals would turn e.g.
    # "waitn't" into "wa it isn't").
    text = re.sub(r"\bThisn't\b", "This isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bthatn't\b", "that isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bitn't\b", "it isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bisnt\b", "isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bdont\b", "don't", text, flags=re.IGNORECASE)

    # Normalize horizontal whitespace but PRESERVE double-line breaks for paragraphs
    # 1. Normalize spaces/tabs
    text = re.sub(r"[ \t]+", " ", text)
    # 2. Normalize more than two newlines to double newlines
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()

if __name__ == "__main__":
    # Local CLI test
    import sys

    print("--- AI Text Humanizer CLI (Re-Authoring Mode) ---")

    # Token resolution: environment variable first, then first CLI argument.
    token = os.getenv("HF_TOKEN") or (sys.argv[1] if len(sys.argv) > 1 else None)

    if not token:
        print("Set HF_TOKEN env var or pass as arg.")
        sys.exit(1)

    print("Enter AI text (Ctrl+D to finish):")
    input_text = sys.stdin.read().strip()

    if input_text:
        # Run the full pipeline and print each stage's output under a banner.
        results = process_text(input_text, token)
        banners = ("PLAN", "BLIND DRAFT", "HUMANIZED", "VERIFICATION")
        for banner, section in zip(banners, results):
            print(f"\n--- {banner} ---\n", section)