File size: 5,080 Bytes
d1abcca
 
7e28f04
d1abcca
c872b36
7e28f04
 
d1abcca
 
 
7e28f04
 
 
d1abcca
17fb2e6
d1abcca
8e8db39
 
 
 
 
d1abcca
7e28f04
 
 
d1abcca
8e8db39
d1abcca
7e28f04
 
 
 
 
d1abcca
7e28f04
8e8db39
 
7e28f04
 
d1abcca
8e8db39
 
 
d1abcca
8e8db39
 
 
d1abcca
8e8db39
 
 
 
 
 
 
 
 
7e28f04
 
8e8db39
c872b36
 
7e28f04
 
d1abcca
7e28f04
 
 
d1abcca
8e8db39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c872b36
 
 
04d5068
c872b36
04d5068
 
 
 
c872b36
 
 
04d5068
 
 
 
 
 
 
 
c872b36
04d5068
c872b36
7e28f04
 
 
 
 
 
 
 
 
 
 
 
d1abcca
 
7e28f04
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

import os
import json
import logging
import re
from agents.planner import Planner
from agents.writer import Writer
from agents.humanizer import Humanizer
from agents.verifier import Verifier

# Configure logging
# Root-level basicConfig: timestamped INFO-level output; every log call in
# this module goes through the module logger below.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def process_text(text, hf_token=None, intensity=1.0):
    """
    Main orchestration function for the Precision Humanization Pipeline.

    Stages:
    1. Planner: Extracts exact statistics and themes.
    2. Writer: Rewrites using Source Anchor + Staccato Rhythm.
    3. Humanizer: Refines with strict Fidelity Pillars.
    4. FidelityGuard: Flags any output that mutates or misses source facts.

    Args:
        text: AI-generated input text to humanize.
        hf_token: Hugging Face API token; falls back to the HF_TOKEN env var.
        intensity: Rewrite strength forwarded to Writer and Humanizer.

    Returns:
        Tuple of (plan_json, blind_draft, humanized_text, verification_str).
        On any failure the first element is an "Error: ..." message and the
        remaining three are empty strings.
    """
    token = hf_token or os.getenv("HF_TOKEN")
    if not token:
        return "Error: HF_TOKEN not found.", "", "", ""

    logger.info("Starting Precision Humanization Pipeline (Phase 15)...")

    # Initialize Agents
    planner = Planner(token)
    writer = Writer(token)
    humanizer = Humanizer(token)
    verifier = Verifier(token)

    try:
        # Step 1: Semantic Planning — produces the entities/points the
        # fidelity guard checks against later.
        logger.info("Step 1: Planning (Exact Mirroring)...")
        plan = planner.plan(text)
        plan_str = json.dumps(plan, indent=2)

        # Step 2: Source-Guided Writing
        logger.info("Step 2: Guided Writing (Source Anchor)...")
        draft = writer.write(plan, text, intensity=intensity)

        # Step 3: Humanization (Grounded Mode)
        # (fixed log-message typo: "Pilar" -> "Pillar", matching the docstring)
        logger.info("Step 3: Humanizing (Fidelity Pillar)...")
        humanized_text = humanizer.humanize(draft, text, intensity=intensity)

        # Step 4: Fidelity Guard (Hard Fact check) — warn-only by design.
        logger.info("Step 4: Fidelity Guard (Fact Lock)...")
        if not fidelity_guard(humanized_text, plan):
            logger.warning("Fidelity check failed! Draft likely mutated facts.")
            # We don't fail the whole app, but we log the warning for the developer.
            # In production, we might retry or return a 'Fidelity Error'.

        # Step 5: Verification
        logger.info("Step 5: Verifying...")
        verification = verifier.verify(humanized_text)

        # Step 6: Final Polish (cosmetic regex cleanup of the returned text)
        humanized_text = final_polish(humanized_text)

        ver_str = f"Label: {verification['label']}\nConfidence: {verification['confidence']:.1%}"
        return plan_str, draft, humanized_text, ver_str

    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception("Pipeline failed: %s", e)
        return f"Error: {str(e)}", "", "", ""

def fidelity_guard(text, plan):
    """
    Verify that the plan's core facts survive in the rewritten text.

    Every planned entity, and every numeric stat ("42 percent" / "42%")
    extracted from the plan's points, must appear case-insensitively in
    *text*. Returns True when all facts are present; False (with an error
    log listing each miss) when any are missing or mutated.
    """
    haystack = text.lower()
    problems = []

    # Each named entity from the plan must still be mentioned somewhere.
    for entity in plan.get("entities", []):
        if entity.lower() not in haystack:
            problems.append(f"Missing Entity: {entity}")

    # Numeric stats pulled out of each point's fact must appear verbatim.
    for point in plan.get("points", []):
        for stat in re.findall(r'\d+\spercent|\d+%', point.get("fact", "")):
            if stat.lower() not in haystack:
                problems.append(f"Missing/Mutated Stat: {stat}")

    if problems:
        logger.error("FIDELITY FAIL: %s", ", ".join(problems))
        return False
    return True

def final_polish(text):
    """
    Final regex-based cleanup to fix tokenization hallucinations
    and ensure structural integrity/spacing.

    Args:
        text: Humanized text to clean.

    Returns:
        Cleaned text with em-dashes replaced, common contraction glitches
        repaired, whitespace normalized, and outer whitespace stripped.
    """
    # Rule 2: NO EM-DASHES (Replace with comma)
    text = text.replace("—", ",").replace("--", ",")

    # Fix common tokenization errors from high temperature.
    # Word boundaries (\b) keep these patterns from firing inside longer
    # words and mangling them (the unanchored originals would turn e.g.
    # "waitn't" into "wa it isn't").
    text = re.sub(r"\bThisn't\b", "This isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bthatn't\b", "that isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bitn't\b", "it isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bisnt\b", "isn't", text, flags=re.IGNORECASE)
    text = re.sub(r"\bdont\b", "don't", text, flags=re.IGNORECASE)

    # Normalize horizontal whitespace but PRESERVE double-line breaks for paragraphs
    # 1. Normalize spaces/tabs
    text = re.sub(r"[ \t]+", " ", text)
    # 2. Normalize more than two newlines to double newlines
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()

if __name__ == "__main__":
    # Local CLI test
    import sys

    print("--- AI Text Humanizer CLI (Re-Authoring Mode) ---")

    # Token resolution: environment variable first, then first CLI argument.
    token = os.getenv("HF_TOKEN") or (sys.argv[1] if len(sys.argv) > 1 else None)

    if not token:
        print("Set HF_TOKEN env var or pass as arg.")
        sys.exit(1)

    print("Enter AI text (Ctrl+D to finish):")
    input_text = sys.stdin.read().strip()

    if input_text:
        # Run the full pipeline and print each stage's output under a banner.
        results = process_text(input_text, token)
        banners = ("PLAN", "BLIND DRAFT", "HUMANIZED", "VERIFICATION")
        for banner, section in zip(banners, results):
            print(f"\n--- {banner} ---\n", section)