Spaces:

bughead
/

humanzise-api

Running

File size: 2,020 Bytes

325e5a1

# utils/pdf_utils.py
import fitz
from io import BytesIO
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt', quiet=True)

def extract_text_from_pdf(pdf_bytes):
    """Extract text from all pages of a PDF."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    all_text = ""
    for page in doc:
        all_text += page.get_text("text") + "\n"
    doc.close()
    return all_text

def word_count(text):
    return len(word_tokenize(text))

def generate_annotated_pdf(pdf_bytes, classification_map):
    """Generate an annotated PDF with color-coded highlights for AI text."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    legend_text = (
        "Color Legend:\n"
        "• Red: AI-generated\n"
        "• Orange: AI-generated & AI-refined\n"
        "• Light Blue: Human-written & AI-refined\n\n"
        "Note: Sentences classified as 'Human-written' are not highlighted."
    )
    legend_page = doc.new_page(pno=0)
    legend_page.insert_text((72, 72), legend_text, fontsize=14, fontname="helv")

    def hex_to_rgb_float(hex_color):
        hex_color = hex_color.lstrip('#')
        r = int(hex_color[0:2], 16) / 255.0
        g = int(hex_color[2:4], 16) / 255.0
        b = int(hex_color[4:6], 16) / 255.0
        return (r, g, b)

    COLOR_MAPPING = {
        "AI-generated": "#ffcccc",
        "AI-generated & AI-refined": "#ffe5cc",
        "Human-written & AI-refined": "#e6f2ff"
    }

    for sentence, label in classification_map.items():
        if label == "Human-written":
            continue
        color_hex = COLOR_MAPPING.get(label)
        if not color_hex:
            continue
        color = hex_to_rgb_float(color_hex)
        for page in doc:
            rects = page.search_for(sentence)
            for rect in rects:
                annot = page.add_highlight_annot(rect)
                annot.set_colors(stroke=color)
                annot.update()

    out_bytes = doc.write()
    doc.close()
    return BytesIO(out_bytes)