Spaces:
Running
Running
File size: 5,058 Bytes
325e5a1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 | import re
from typing import Dict, Optional
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from utils.ai_detection_utils import classify_text_hf
from utils.pdf_utils import extract_text_from_pdf
from utils.humanizer_core import (
count_sentences,
count_words,
extract_citations,
minimal_rewriting,
preserve_linebreaks_rewrite,
restore_citations,
)
# Long-form API description; passed to FastAPI(description=...) when the app
# object is created further down in this module.
DESCRIPTION = """
AI Text Humanizer & Detector API
Provides server-side access to the project's text humanization and AI-detection
pipelines. The API is consumed by the Next.js frontend in /web.
"""
# Tag metadata handed to FastAPI through `openapi_tags`. The two names here
# ("humanize", "detect") are the tags the route decorators in this module use.
tags_metadata = [
    {"name": "humanize", "description": "Transform AI-generated text into human-like prose."},
    {"name": "detect", "description": "Classify text as AI-generated or human-written."},
]
# The application instance. Every route in this module registers itself on
# this object through the @app.get / @app.post decorators.
app = FastAPI(
    title="AI Text Humanizer API",
    version="0.3",
    description=DESCRIPTION,
    openapi_tags=tags_metadata,
)
# Cross-origin policy: any origin, any method and any header is accepted, and
# credentials are enabled.
# NOTE(review): the FastAPI/Starlette CORS documentation discourages combining
# wildcard origins with allow_credentials=True. None of the routes visible in
# this module read cookies or auth headers, so credentials may not be needed -
# confirm with the frontend, then either drop allow_credentials or list the
# frontend origin(s) explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class HumanizeRequest(BaseModel):
    """Request body accepted by ``POST /humanize``.

    ``p_syn`` and ``p_trans`` are validated to lie in the closed interval
    [0, 1] and are forwarded unchanged to the rewriting helper chosen by
    ``preserve_linebreaks``.
    """

    # Required; the handler additionally rejects whitespace-only text.
    text: str = Field(..., description="The input text to humanize. Must be non-empty.")
    # Passed through as the ``p_syn`` keyword of the rewriting helper.
    p_syn: Optional[float] = Field(default=0.2, ge=0.0, le=1.0)
    # Passed through as the ``p_trans`` keyword of the rewriting helper.
    p_trans: Optional[float] = Field(default=0.2, ge=0.0, le=1.0)
    # Truthy -> preserve_linebreaks_rewrite, falsy -> minimal_rewriting.
    preserve_linebreaks: Optional[bool] = Field(default=True)
class HumanizeResponse(BaseModel):
    """Response body returned by ``POST /humanize``."""

    # The rewritten text after the handler's spacing/quote clean-up.
    humanized_text: str

    # Counts measured on the text exactly as submitted.
    orig_word_count: int
    orig_sentence_count: int

    # The same counts measured on ``humanized_text``.
    new_word_count: int
    new_sentence_count: int

    # new - orig for each count; negative when the rewrite is shorter.
    words_added: int
    sentences_added: int
class DetectRequest(BaseModel):
    """Request body accepted by ``POST /detect``."""

    # Required; the handler additionally rejects whitespace-only text.
    text: str = Field(..., description="The input text to analyze.")
class DetectResponse(BaseModel):
    """Response body returned by ``POST /detect``."""

    # Second and first elements (respectively) of the tuple returned by
    # classify_text_hf, relayed to the client as-is.
    percentages: Dict[str, float]
    classification: Dict[str, str]

    # Mean AI probability scaled to a percentage and rounded to 2 decimals.
    ai_score: float
    # 100 - ai_score, rounded to 2 decimals.
    human_score: float
@app.get("/health", tags=["humanize"], summary="Health check")
def health():
    """Liveness probe: always answers with ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload
# Spacing/quote clean-up rules, applied in this order to the rewritten text.
# Compiled once at import time instead of being looked up on every request.
_CLEANUP_RULES = (
    (re.compile(r"[ \t]+([.,;:!?])"), r"\1"),  # no blanks before punctuation
    (re.compile(r"(\()[ \t]+"), r"\1"),  # no blanks right after "("
    (re.compile(r"[ \t]+(\))"), r"\1"),  # no blanks right before ")"
    (re.compile(r"[ \t]{2,}"), " "),  # collapse runs of spaces/tabs
    (re.compile(r"``\s*(.+?)\s*''"), r'"\1"'),  # `` ... '' -> "..."
)


def _tidy_spacing(text):
    """Apply every rule in ``_CLEANUP_RULES`` to *text*, in order."""
    for pattern, replacement in _CLEANUP_RULES:
        text = pattern.sub(replacement, text)
    return text


@app.post("/humanize", response_model=HumanizeResponse, tags=["humanize"])
def humanize(req: HumanizeRequest):
    """Rewrite the submitted text and report how its size changed.

    The text goes through ``extract_citations``, one of the two rewriting
    helpers (selected by ``req.preserve_linebreaks``), ``restore_citations``
    and finally a spacing/quote clean-up pass. Word and sentence counts are
    taken before and after so the response can report the difference.

    Args:
        req: Validated request body.

    Returns:
        A dict matching ``HumanizeResponse``.

    Raises:
        HTTPException: 400 when ``text`` is empty or whitespace-only.
    """
    text = req.text or ""
    if not text.strip():
        raise HTTPException(status_code=400, detail="`text` must be a non-empty string")

    # Baseline counts are taken on the text exactly as submitted.
    orig_wc = count_words(text)
    orig_sc = count_sentences(text)

    # Citations are swapped for placeholders around the rewrite - presumably
    # so the rewriter cannot alter them; confirm in utils.humanizer_core.
    no_refs_text, placeholders = extract_citations(text)
    rewrite = preserve_linebreaks_rewrite if req.preserve_linebreaks else minimal_rewriting
    rewritten = rewrite(no_refs_text, p_syn=req.p_syn, p_trans=req.p_trans)
    final_text = _tidy_spacing(restore_citations(rewritten, placeholders))

    new_wc = count_words(final_text)
    new_sc = count_sentences(final_text)
    return {
        "humanized_text": final_text,
        "orig_word_count": orig_wc,
        "orig_sentence_count": orig_sc,
        "new_word_count": new_wc,
        "new_sentence_count": new_sc,
        "words_added": new_wc - orig_wc,
        "sentences_added": new_sc - orig_sc,
    }
@app.post("/extract-file", tags=["humanize"], summary="Extract text from uploaded file")
async def extract_file(file: UploadFile = File(...)):
    """Accept a PDF, TXT or MD file and return its plain-text contents.

    The file type is decided from the (case-insensitive) filename suffix and
    is checked before the upload body is read, so an unsupported upload is
    rejected without reading its payload.

    Args:
        file: The uploaded file.

    Returns:
        ``{"text": <extracted text>, "filename": <name as uploaded>}``.

    Raises:
        HTTPException: 400 when no filename is supplied or the suffix is not
            .pdf/.txt/.md; 500 when PDF extraction fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    name = file.filename.lower()
    is_pdf = name.endswith(".pdf")
    if not is_pdf and not name.endswith((".txt", ".md")):
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type. Use .pdf, .txt, or .md",
        )

    content = await file.read()

    if not is_pdf:
        # errors="ignore" drops undecodable bytes instead of raising, so this
        # branch needs no error handling.
        text = content.decode("utf-8", errors="ignore")
    else:
        # NOTE(review): this call is synchronous inside an async handler; if
        # PDF parsing is slow it will stall the event loop - consider
        # offloading it to a worker thread.
        try:
            text = extract_text_from_pdf(content)
        except HTTPException:
            # Let an HTTP error raised by the extractor pass through untouched.
            raise
        except Exception as exc:
            # Chain the cause so the original traceback reaches the logs.
            raise HTTPException(status_code=500, detail=f"Failed to extract: {exc}") from exc

    return {"text": text, "filename": file.filename}
@app.post("/detect", response_model=DetectResponse, tags=["detect"])
def detect(req: DetectRequest):
    """Classify the submitted text and return AI/human scores.

    Raises:
        HTTPException: 400 when ``text`` is empty or whitespace-only.
    """
    text = req.text or ""
    if text.strip() == "":
        raise HTTPException(status_code=400, detail="`text` must be a non-empty string")

    result = classify_text_hf(text)
    classification_map, percentages, mean_ai_prob = result

    # The headline score is the raw mean probability rather than a bucket
    # count: bucket-counting collapses to 0 for borderline text, so the mean
    # is the more honest signal.
    ai_score = round(mean_ai_prob * 100, 2)
    human_score = round(100 - ai_score, 2)

    return dict(
        percentages=percentages,
        classification=classification_map,
        ai_score=ai_score,
        human_score=human_score,
    )
|