# Page-scrape residue kept for reference (not part of the program):
# Spaces:
# Sleeping
# Sleeping
# File size: 3,529 Bytes
# 1311701 d85300a 73a9e74 d4f7222 ed14152 1311701 d85300a d4f7222 8f04db2 d4f7222 8f04db2 d4f7222 8f04db2 ed14152 d4f7222 ed14152 d4f7222 225d229 d4f7222 ed14152 225d229 ed14152 225d229 ed14152 225d229 ed14152 225d229 d4f7222 225d229 c6710ef 8f04db2 a21041f 8f04db2 a21041f 8f04db2 a21041f 225d229 ed14152 8f04db2 d4f7222 d85300a 1311701 d4f7222 1311701 a21041f d4f7222 1311701 a21041f 1311701 d85300a c6710ef
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
import re
import gradio as gr
def insert_points(text):
    """Insert numbered ``[POSITION_NNN]`` tags between paragraphs and before headings.

    The input is trimmed, its line endings are normalised to a single newline
    character and runs of blank lines are collapsed. The output always starts
    with tag 001; after that, one tag is emitted at each paragraph break and
    in front of each ``#``-style heading, never two tags in a row. Numbers are
    zero-padded to three digits and simply grow wider past 999.

    Args:
        text: Raw input text. ``None`` is treated as an empty string.

    Returns:
        The tagged text, with every tag on its own line. Empty input yields
        only the first tag followed by a newline.
    """
    if text is None:
        text = ""

    # Pattern for any tag this function can emit. The counter is padded to
    # three digits but is wider from 1000 on, so the cleanup must not assume
    # exactly three digits.
    tag = r'\[POSITION_\d{3,}]'

    # Start the counter at 1
    counter = 1

    # -----------------------------
    # 0) NORMALIZE LINE ENDINGS AND REMOVE EXCESS WHITESPACE
    # -----------------------------
    text = text.strip()
    # Windows-style (CRLF) first, then any remaining lone carriage returns,
    # so every line break is a plain newline before splitting.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse multiple blank lines into one
    text = re.sub(r'\n\s*\n+', '\n', text)

    # -----------------------------
    # 1) FIRST PASS: INSERT NEWLINES BEFORE HEADINGS
    #    This separates headings that might be in running text
    # -----------------------------
    text = re.sub(r'(\s+)(#+\s+)', r'\1\n\2', text)
    # A heading at the very start has no whitespace in front of it, so give
    # it the leading newline the splitter below looks for.
    if text.startswith('#'):
        text = '\n' + text

    # -----------------------------
    # 2) PREPARE FOR PROCESSING
    # -----------------------------
    # First tag should be 001 and come at the beginning
    result = [f"[POSITION_{counter:03d}]\n"]
    counter += 1

    # Split into heading lines, bare newlines and ordinary text; the capture
    # group keeps the separators, and empty pieces are dropped.
    segments = [s for s in re.split(r'(\n#+\s+.+?(?=\n|$)|\n)', text) if s]
    previous_was_tag = True  # Since we just added the first tag

    # -----------------------------
    # 3) PROCESS SEGMENTS
    # -----------------------------
    for segment in segments:
        if segment == '\n':
            # A paragraph break gets a tag, unless a tag was just written.
            if previous_was_tag:
                result.append('\n')
            else:
                result.append(f"\n[POSITION_{counter:03d}]\n")
                counter += 1
                previous_was_tag = True
        else:
            # Headings get a tag in front of them (again, never doubled);
            # ordinary text is copied through unchanged.
            if segment.startswith('\n#') and not previous_was_tag:
                result.append(f"\n[POSITION_{counter:03d}]")
                counter += 1
            result.append(segment)
            previous_was_tag = False

    # Join all segments back together
    text = ''.join(result)

    # -----------------------------
    # 4) CLEAN UP: Normalize spacing around tags
    # -----------------------------
    # Ensure exactly one newline before each tag
    text = re.sub(r'([^\n])\[POSITION_', r'\1\n[POSITION_', text)
    text = re.sub(r'\n+(\[POSITION_)', r'\n\1', text)
    # Ensure exactly one newline after each tag
    text = re.sub(rf'({tag})([^\n])', r'\1\n\2', text)
    text = re.sub(rf'({tag})\n+', r'\1\n', text)
    # Remove any newlines at the very beginning of the text
    text = re.sub(r'^\n+', '', text)
    # Ensure no consecutive tags
    text = re.sub(rf'({tag}\n)\s*{tag}', r'\1', text)
    return text
# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# Single-function Gradio app: one text box in, one text box out, with
# insert_points doing the processing.
demo = gr.Interface(
    fn=insert_points,
    inputs=gr.Textbox(
        lines=10,
        placeholder="Paste your text here...",
        label="Your Input Text",
    ),
    outputs=gr.Textbox(
        label="Processed Text with Tags",
        show_copy_button=True,  # Enable copy button
    ),
    title="Insert Point Tagger",
    description=(
        "This processor inserts numbered tags between paragraphs and before #-headers"
    ),
)

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()