# Precision Humanization Pipeline orchestrator.
# NOTE(review): the lines that preceded the imports were non-Python page-scrape
# residue (Spaces status, file size, git-blame hashes) and broke parsing; they
# have been converted into this comment header.
import os
import json
import logging
import re
from agents.planner import Planner
from agents.writer import Writer
from agents.humanizer import Humanizer
from agents.verifier import Verifier
# Configure logging: INFO level with timestamped, leveled records.
# `logger` is the module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def process_text(text, hf_token=None, intensity=1.0):
    """
    Main orchestration function for the Precision Humanization Pipeline.

    1. Planner: Extracts exact statistics and themes.
    2. Writer: Rewrites using Source Anchor + Staccato Rhythm.
    3. Humanizer: Refines with strict Fidelity Pillars.
    4. FidelityGuard: Blocks any output that mutates or misses source facts.

    Args:
        text: AI-generated input text to humanize.
        hf_token: Hugging Face API token; falls back to the HF_TOKEN env var.
        intensity: Rewrite strength forwarded to Writer and Humanizer.

    Returns:
        A 4-tuple (plan JSON string, blind draft, humanized text,
        verification summary). On failure the first element is an error
        message and the remaining three are empty strings.
    """
    token = hf_token or os.getenv("HF_TOKEN")
    if not token:
        return "Error: HF_TOKEN not found.", "", "", ""
    logger.info("Starting Precision Humanization Pipeline (Phase 15)...")
    # Initialize Agents (each wraps a model call authenticated by the token).
    planner = Planner(token)
    writer = Writer(token)
    humanizer = Humanizer(token)
    verifier = Verifier(token)
    try:
        # Step 1: Semantic Planning
        logger.info("Step 1: Planning (Exact Mirroring)...")
        plan = planner.plan(text)
        plan_str = json.dumps(plan, indent=2)
        # Step 2: Source-Guided Writing
        logger.info("Step 2: Guided Writing (Source Anchor)...")
        draft = writer.write(plan, text, intensity=intensity)
        # Step 3: Humanization (Grounded Mode)
        logger.info("Step 3: Humanizing (Fidelity Pillar)...")  # fixed "Pilar" typo
        humanized_text = humanizer.humanize(draft, text, intensity=intensity)
        # Step 4: Fidelity Guard (Hard Fact check)
        logger.info("Step 4: Fidelity Guard (Fact Lock)...")
        if not fidelity_guard(humanized_text, plan):
            logger.warning("Fidelity check failed! Draft likely mutated facts.")
            # We don't fail the whole app, but we log the warning for the developer.
            # In production, we might retry or return a 'Fidelity Error'.
        # Step 5: Verification
        logger.info("Step 5: Verifying...")
        verification = verifier.verify(humanized_text)
        # Step 6: Final Polish (regex cleanup of tokenization artifacts)
        humanized_text = final_polish(humanized_text)
        ver_str = f"Label: {verification['label']}\nConfidence: {verification['confidence']:.1%}"
        return plan_str, draft, humanized_text, ver_str
    except Exception as e:
        # logger.exception preserves the traceback in the log (logger.error did not).
        logger.exception("Pipeline failed: %s", e)
        return f"Error: {e}", "", "", ""
def fidelity_guard(text, plan):
    """
    Check that the core facts (stats, names) from the plan appear in the text.

    Args:
        text: Candidate output to inspect.
        plan: Planner dict with optional "entities" and "points" keys.

    Returns:
        True when every planned entity and extracted stat is present in the
        text (case-insensitive); False when anything is missing or mutated.
    """
    haystack = text.lower()
    # Pass 1: every planned entity must appear somewhere in the text.
    issues = [
        f"Missing Entity: {name}"
        for name in plan.get("entities", [])
        if name.lower() not in haystack
    ]
    # Pass 2: pull stats like "42 percent" / "42%" out of each fact and
    # require each one verbatim in the text.
    stat_pattern = re.compile(r'\d+\spercent|\d+%')
    for point in plan.get("points", []):
        for stat in stat_pattern.findall(point.get("fact", "")):
            if stat.lower() not in haystack:
                issues.append(f"Missing/Mutated Stat: {stat}")
    if not issues:
        return True
    logger.error("FIDELITY FAIL: %s", ", ".join(issues))
    return False
def final_polish(text):
    """
    Final regex-based cleanup to fix tokenization hallucinations
    and ensure structural integrity/spacing.

    Args:
        text: Humanized text to clean up.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Rule 2: NO EM-DASHES (Replace with comma)
    text = text.replace("—", ",").replace("--", ",")
    # Fix common tokenization errors from high temperature. Word boundaries
    # (\b) stop the patterns from corrupting longer words that merely contain
    # the target (e.g. "Dontrelle" must not become "don'trelle").
    text = re.sub(r"\bThisn't\b", "This isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bthatn't\b", "that isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bitn't\b", "it isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bisnt\b", "isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bdont\b", "don't", text, flags=re.IGNORECASE)
    # Normalize horizontal whitespace but PRESERVE double-line breaks for paragraphs
    # 1. Collapse runs of spaces/tabs to a single space.
    text = re.sub(r"[ \t]+", " ", text)
    # 2. Collapse three-or-more newlines to a paragraph break (two newlines).
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
if __name__ == "__main__":
    # Local CLI smoke test: resolve a token, read AI text from stdin, and
    # print each pipeline stage in turn.
    import sys

    print("--- AI Text Humanizer CLI (Re-Authoring Mode) ---")
    # Token resolution: HF_TOKEN environment variable first, then argv[1].
    cli_token = os.getenv("HF_TOKEN") or (sys.argv[1] if len(sys.argv) > 1 else None)
    if not cli_token:
        print("Set HF_TOKEN env var or pass as arg.")
        sys.exit(1)
    print("Enter AI text (Ctrl+D to finish):")
    raw = sys.stdin.read().strip()
    if raw:
        stages = zip(
            ("PLAN", "BLIND DRAFT", "HUMANIZED", "VERIFICATION"),
            process_text(raw, cli_token),
        )
        for heading, payload in stages:
            print(f"\n--- {heading} ---\n", payload)
# NOTE(review): removed a stray trailing "|" left over from page-scrape residue.