# BtB-ExpC
# Normalizing whitespace & adding copy button
# a21041f
import re
import gradio as gr
def insert_points(text):
    """Insert numbered ``[POSITION_NNN]`` tags between paragraphs and before #-headings.

    The text is trimmed, its line endings are normalized and runs of blank
    lines are collapsed.  A first tag is always emitted at the very start;
    after that a new tag is placed at each line break and in front of each
    ``#`` heading (including headings found in the middle of running text),
    never two tags in a row.  Every tag ends up alone on its own line.

    Tag numbers are zero-padded to three digits and simply grow wider once
    the count passes 999.

    Args:
        text: Raw input text.  ``None`` is treated as an empty string.

    Returns:
        The tagged text.  Empty input yields just the first tag followed by
        a newline.
    """
    # Treat a missing value as empty input instead of failing on .strip().
    if text is None:
        text = ''

    # Matches one complete tag.  The number is "three or more" digits because
    # the :03d format only pads; it does not truncate counters above 999.
    tag = r'\[POSITION_\d{3,}]'

    # -----------------------------
    # 0) NORMALIZE LINE ENDINGS AND REMOVE EXCESS WHITESPACE
    # -----------------------------
    text = text.strip()
    # Windows-style first, then any remaining bare carriage returns
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse multiple blank (or whitespace-only) lines into one line break
    text = re.sub(r'\n\s*\n+', '\n', text)

    # -----------------------------
    # 1) FIRST PASS: INSERT NEWLINES BEFORE HEADINGS
    #    This separates headings that might be in running text
    # -----------------------------
    text = re.sub(r'(\s+)(#+\s+)', r'\1\n\2', text)
    # A heading at the very start has no preceding whitespace for the regex
    # above to match, so give it the leading newline the splitter expects.
    if text.startswith('#'):
        text = '\n' + text

    # -----------------------------
    # 2) PREPARE FOR PROCESSING
    # -----------------------------
    # First tag is always 001 and comes at the beginning
    counter = 1
    result = [f"[POSITION_{counter:03d}]\n"]
    counter += 1

    # Split into heading lines, bare newlines and plain text; the capturing
    # group keeps the separators in the list.  Empty pieces are dropped.
    segments = re.split(r'(\n#+\s+.+?(?=\n|$)|\n)', text)
    segments = [s for s in segments if s]

    previous_was_tag = True  # the first tag was just emitted

    # -----------------------------
    # 3) PROCESS SEGMENTS
    # -----------------------------
    for segment in segments:
        if segment == '\n':
            # A line break gets a tag, unless a tag was emitted just before
            if not previous_was_tag:
                result.append(f"\n[POSITION_{counter:03d}]\n")
                counter += 1
                previous_was_tag = True
            else:
                result.append('\n')
        elif segment.startswith('\n#'):
            # A heading gets a tag in front of it, unless one is already there
            if not previous_was_tag:
                result.append(f"\n[POSITION_{counter:03d}]")
                counter += 1
            result.append(segment)
            previous_was_tag = False
        else:
            # Regular text segment
            result.append(segment)
            previous_was_tag = False

    text = ''.join(result)

    # -----------------------------
    # 4) CLEAN UP: Normalize spacing around tags
    # -----------------------------
    # Exactly one newline before each tag
    text = re.sub(r'([^\n])\[POSITION_', r'\1\n[POSITION_', text)
    text = re.sub(r'\n+(\[POSITION_)', r'\n\1', text)
    # Exactly one newline after each tag
    text = re.sub(rf'({tag})([^\n])', r'\1\n\2', text)
    text = re.sub(rf'({tag})\n+', r'\1\n', text)
    # No newlines at the very beginning of the text
    text = re.sub(r'^\n+', '', text)
    # No consecutive tags: keep the first of any back-to-back pair
    text = re.sub(rf'({tag}\n)\s*{tag}', r'\1', text)

    return text
# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# Input side: a multi-line box the user pastes raw text into.
_input_box = gr.Textbox(
    lines=10,
    placeholder="Paste your text here...",
    label="Your Input Text",
)

# Output side: the tagged text, with a copy button enabled.
_output_box = gr.Textbox(
    label="Processed Text with Tags",
    show_copy_button=True,
)

# Wire insert_points between the two text boxes.
demo = gr.Interface(
    fn=insert_points,
    inputs=_input_box,
    outputs=_output_box,
    title="Insert Point Tagger",
    description="This processor inserts numbered tags between paragraphs and before #-headers",
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()