Spaces:
Sleeping
Sleeping
position
Browse files
README.md
CHANGED
|
@@ -7,7 +7,7 @@ sdk: gradio
|
|
| 7 |
sdk_version: 5.25.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
short_description: Inserts [INSERT_POINT_###] tags in large texts
|
| 11 |
---
|
| 12 |
|
| 13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 7 |
sdk_version: 5.25.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
short_description: Inserts [POSITION_###] tags in large texts
|
| 11 |
---
|
| 12 |
|
| 13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import gradio as gr
|
|
| 3 |
|
| 4 |
|
| 5 |
def insert_points(text):
|
|
|
|
| 6 |
counter = 1
|
| 7 |
|
| 8 |
# -----------------------------
|
|
@@ -25,22 +26,26 @@ def insert_points(text):
|
|
| 25 |
text = '\n' + text
|
| 26 |
|
| 27 |
# -----------------------------
|
| 28 |
-
# 2)
|
| 29 |
-
# Process the text in a single pass to avoid duplicate tags
|
| 30 |
# -----------------------------
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Split the text into segments (paragraphs and headings)
|
| 33 |
segments = re.split(r'(\n#+\s+.+?(?=\n|$)|\n)', text)
|
| 34 |
segments = [s for s in segments if s] # Remove empty segments
|
| 35 |
|
| 36 |
-
|
| 37 |
-
previous_was_tag = False
|
| 38 |
|
|
|
|
|
|
|
|
|
|
| 39 |
for segment in segments:
|
| 40 |
# If this is a newline, add a tag (but not after another tag)
|
| 41 |
if segment == '\n':
|
| 42 |
if not previous_was_tag:
|
| 43 |
-
result.append(f"\n[INSERT_POINT_{counter:03d}]\n")
|
| 44 |
counter += 1
|
| 45 |
previous_was_tag = True
|
| 46 |
else:
|
|
@@ -49,7 +54,7 @@ def insert_points(text):
|
|
| 49 |
# If this is a heading, add a tag before it
|
| 50 |
elif segment.startswith('\n#'):
|
| 51 |
if not previous_was_tag:
|
| 52 |
-
result.append(f"\n[INSERT_POINT_{counter:03d}]")
|
| 53 |
counter += 1
|
| 54 |
previous_was_tag = True
|
| 55 |
result.append(segment)
|
|
@@ -63,22 +68,15 @@ def insert_points(text):
|
|
| 63 |
# Join all segments back together
|
| 64 |
text = ''.join(result)
|
| 65 |
|
| 66 |
-
# -----------------------------
|
| 67 |
-
# 3) ADD TAG AT THE BEGINNING IF NEEDED
|
| 68 |
-
# -----------------------------
|
| 69 |
-
if not text.startswith('[INSERT_POINT_'):
|
| 70 |
-
text = f"[INSERT_POINT_{counter:03d}]\n" + text
|
| 71 |
-
counter += 1
|
| 72 |
-
|
| 73 |
# -----------------------------
|
| 74 |
# 4) CLEAN UP: Remove excess newlines
|
| 75 |
# -----------------------------
|
| 76 |
# Remove extra blank lines before tags
|
| 77 |
-
text = re.sub(r'\n\n+(\[INSERT_POINT_)', r'\n\1', text)
|
| 78 |
# Remove extra blank lines at the beginning
|
| 79 |
text = re.sub(r'^\n+', '', text)
|
| 80 |
# Ensure no consecutive tags
|
| 81 |
-
text = re.sub(r'(\[INSERT_POINT_\d{3}]\n)\s*\[INSERT_POINT_\d{3}]', r'\1', text)
|
| 82 |
|
| 83 |
return text
|
| 84 |
|
|
@@ -96,11 +94,8 @@ demo = gr.Interface(
|
|
| 96 |
outputs=gr.Textbox(label="Processed Text with Tags"),
|
| 97 |
title="Insert Point Tagger",
|
| 98 |
description=(
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
"2) Places each tag on its own line.\n"
|
| 102 |
-
"3) Ensures consistent, sequential numbering (001, 002, etc.).\n"
|
| 103 |
-
"4) Avoids consecutive tags - never two tags in a row."
|
| 104 |
),
|
| 105 |
)
|
| 106 |
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def insert_points(text):
|
| 6 |
+
# Start the counter at 1
|
| 7 |
counter = 1
|
| 8 |
|
| 9 |
# -----------------------------
|
|
|
|
| 26 |
text = '\n' + text
|
| 27 |
|
| 28 |
# -----------------------------
|
| 29 |
+
# 2) PREPARE FOR PROCESSING
|
|
|
|
| 30 |
# -----------------------------
|
| 31 |
+
# First tag should be 001 and come at the beginning
|
| 32 |
+
result = [f"[POSITION_{counter:03d}]\n"]
|
| 33 |
+
counter += 1
|
| 34 |
|
| 35 |
# Split the text into segments (paragraphs and headings)
|
| 36 |
segments = re.split(r'(\n#+\s+.+?(?=\n|$)|\n)', text)
|
| 37 |
segments = [s for s in segments if s] # Remove empty segments
|
| 38 |
|
| 39 |
+
previous_was_tag = True # Since we just added the first tag
|
|
|
|
| 40 |
|
| 41 |
+
# -----------------------------
|
| 42 |
+
# 3) PROCESS SEGMENTS
|
| 43 |
+
# -----------------------------
|
| 44 |
for segment in segments:
|
| 45 |
# If this is a newline, add a tag (but not after another tag)
|
| 46 |
if segment == '\n':
|
| 47 |
if not previous_was_tag:
|
| 48 |
+
result.append(f"\n[POSITION_{counter:03d}]\n")
|
| 49 |
counter += 1
|
| 50 |
previous_was_tag = True
|
| 51 |
else:
|
|
|
|
| 54 |
# If this is a heading, add a tag before it
|
| 55 |
elif segment.startswith('\n#'):
|
| 56 |
if not previous_was_tag:
|
| 57 |
+
result.append(f"\n[POSITION_{counter:03d}]")
|
| 58 |
counter += 1
|
| 59 |
previous_was_tag = True
|
| 60 |
result.append(segment)
|
|
|
|
| 68 |
# Join all segments back together
|
| 69 |
text = ''.join(result)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# -----------------------------
|
| 72 |
# 4) CLEAN UP: Remove excess newlines
|
| 73 |
# -----------------------------
|
| 74 |
# Remove extra blank lines before tags
|
| 75 |
+
text = re.sub(r'\n\n+(\[POSITION_)', r'\n\1', text)
|
| 76 |
# Remove extra blank lines at the beginning
|
| 77 |
text = re.sub(r'^\n+', '', text)
|
| 78 |
# Ensure no consecutive tags
|
| 79 |
+
text = re.sub(r'(\[POSITION_\d{3}]\n)\s*\[POSITION_\d{3}]', r'\1', text)
|
| 80 |
|
| 81 |
return text
|
| 82 |
|
|
|
|
| 94 |
outputs=gr.Textbox(label="Processed Text with Tags"),
|
| 95 |
title="Insert Point Tagger",
|
| 96 |
description=(
|
| 97 |
+
"This processor inserts numbered tags between paragraphs and before #-headers"
|
| 98 |
+
|
|
|
|
|
|
|
|
|
|
| 99 |
),
|
| 100 |
)
|
| 101 |
|