File size: 3,529 Bytes
1311701
d85300a
 
73a9e74
d4f7222
ed14152
1311701
d85300a
d4f7222
8f04db2
d4f7222
8f04db2
 
 
 
d4f7222
 
 
8f04db2
 
 
 
 
 
 
 
 
 
ed14152
d4f7222
ed14152
 
 
d4f7222
225d229
 
 
d4f7222
ed14152
225d229
ed14152
 
 
225d229
 
 
 
ed14152
225d229
 
 
 
 
 
 
 
ed14152
225d229
 
 
 
 
 
 
 
 
d4f7222
225d229
 
c6710ef
8f04db2
a21041f
8f04db2
a21041f
 
 
 
 
 
 
 
 
8f04db2
a21041f
225d229
ed14152
8f04db2
d4f7222
 
 
 
 
 
d85300a
 
1311701
 
d4f7222
1311701
 
a21041f
 
 
 
d4f7222
1311701
a21041f
1311701
d85300a
 
 
c6710ef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re
import gradio as gr


def insert_points(text: str) -> str:
    """Insert numbered ``[POSITION_NNN]`` tags between paragraphs and before headings.

    The text is stripped, its line endings are normalized and runs of blank
    lines are collapsed.  A newline is forced in front of every ``#``-style
    heading marker that follows whitespace, so headings embedded in running
    text end up on their own line.  The output always starts with
    ``[POSITION_001]``; a further tag is emitted at every remaining line
    break, and before a heading that directly follows text.  Each tag sits
    on a line of its own.

    Example: the two paragraphs ``Para1`` and ``Para2`` separated by a blank
    line become these four lines::

        [POSITION_001]
        Para1
        [POSITION_002]
        Para2

    Args:
        text: Raw input text.  ``None`` is treated like an empty string.

    Returns:
        The tagged text.  Tags are numbered consecutively from 001, zero-padded
        to at least three digits.  Empty input yields just the first tag
        followed by a newline.
    """
    # Start the counter at 1
    counter = 1

    # -----------------------------
    # 0) NORMALIZE LINE ENDINGS AND REMOVE EXCESS WHITESPACE
    # -----------------------------
    # A missing value is handled like empty input instead of raising
    text = (text or "").strip()
    # Replace Windows-style (\r\n) and bare carriage-return (\r) line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse multiple blank lines into one
    text = re.sub(r'\n\s*\n+', '\n', text)

    # -----------------------------
    # 1) FIRST PASS: INSERT NEWLINES BEFORE HEADINGS
    #    This separates headings that might be in running text
    # -----------------------------
    text = re.sub(r'(\s+)(#+\s+)', r'\1\n\2', text)

    # Special case for heading at the start without newline
    if text.startswith('#'):
        text = '\n' + text

    # -----------------------------
    # 2) PREPARE FOR PROCESSING
    # -----------------------------
    # First tag should be 001 and come at the beginning
    result = [f"[POSITION_{counter:03d}]\n"]
    counter += 1

    # Split the text into segments (paragraphs and headings); the capture
    # group keeps the separators (newlines / heading lines) in the list.
    segments = re.split(r'(\n#+\s+.+?(?=\n|$)|\n)', text)
    # Drop empty segments and whitespace-only ones.  The latter are the
    # indentation left behind when a newline is inserted before an indented
    # heading; treating them as content would emit two tags with nothing in
    # between and leave a gap in the numbering once the duplicate is removed.
    segments = [s for s in segments if s == '\n' or s.strip()]

    previous_was_tag = True  # Since we just added the first tag

    # -----------------------------
    # 3) PROCESS SEGMENTS
    # -----------------------------
    for segment in segments:
        # If this is a newline, add a tag (but not after another tag)
        if segment == '\n':
            if not previous_was_tag:
                result.append(f"\n[POSITION_{counter:03d}]\n")
                counter += 1
                previous_was_tag = True
            else:
                result.append('\n')  # Just add the newline without a tag

        # If this is a heading, add a tag before it
        elif segment.startswith('\n#'):
            if not previous_was_tag:
                result.append(f"\n[POSITION_{counter:03d}]")
                counter += 1
            result.append(segment)
            previous_was_tag = False

        # Regular text segment
        else:
            result.append(segment)
            previous_was_tag = False

    # Join all segments back together
    text = ''.join(result)

    # -----------------------------
    # 4) CLEAN UP: Normalize spacing around tags
    # -----------------------------
    # The patterns accept three OR MORE digits: the :03d format widens past
    # 999, and those tags must be normalized like any other.

    # Ensure exactly one newline before each tag
    text = re.sub(r'([^\n])\[POSITION_', r'\1\n[POSITION_', text)
    text = re.sub(r'\n+(\[POSITION_)', r'\n\1', text)

    # Ensure exactly one newline after each tag
    text = re.sub(r'(\[POSITION_\d{3,}])([^\n])', r'\1\n\2', text)
    text = re.sub(r'(\[POSITION_\d{3,}])\n+', r'\1\n', text)

    # Remove any newlines at the very beginning of the text
    text = re.sub(r'^\n+', '', text)

    # Ensure no consecutive tags (safety net; the loop above never emits two
    # tags without content between them)
    text = re.sub(r'(\[POSITION_\d{3,}]\n)\s*\[POSITION_\d{3,}]', r'\1', text)

    return text


# -----------------------------
#  GRADIO INTERFACE
# -----------------------------
# Input widget: a ten-line free-text area for the raw document.
_source_box = gr.Textbox(
    label="Your Input Text",
    placeholder="Paste your text here...",
    lines=10,
)

# Output widget: shows the tagged text and offers a copy button.
_tagged_box = gr.Textbox(
    show_copy_button=True,
    label="Processed Text with Tags",
)

# Wire insert_points between the two text boxes.
demo = gr.Interface(
    title="Insert Point Tagger",
    description="This processor inserts numbered tags between paragraphs and before #-headers",
    fn=insert_points,
    inputs=_source_box,
    outputs=_tagged_box,
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()