BtB-ExpC committed on
Commit
ed14152
·
1 Parent(s): 225d229
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +15 -20
README.md CHANGED
@@ -7,7 +7,7 @@ sdk: gradio
7
  sdk_version: 5.25.0
8
  app_file: app.py
9
  pinned: false
10
- short_description: Inserts [INSERT_POINT_###] tags in large texts
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  sdk_version: 5.25.0
8
  app_file: app.py
9
  pinned: false
10
+ short_description: Inserts [POSITION_###] tags in large texts
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
 
4
 
5
  def insert_points(text):
 
6
  counter = 1
7
 
8
  # -----------------------------
@@ -25,22 +26,26 @@ def insert_points(text):
25
  text = '\n' + text
26
 
27
  # -----------------------------
28
- # 2) SPLIT TEXT INTO SEGMENTS AND TAG
29
- # Process the text in a single pass to avoid duplicate tags
30
  # -----------------------------
 
 
 
31
 
32
  # Split the text into segments (paragraphs and headings)
33
  segments = re.split(r'(\n#+\s+.+?(?=\n|$)|\n)', text)
34
  segments = [s for s in segments if s] # Remove empty segments
35
 
36
- result = []
37
- previous_was_tag = False
38
 
 
 
 
39
  for segment in segments:
40
  # If this is a newline, add a tag (but not after another tag)
41
  if segment == '\n':
42
  if not previous_was_tag:
43
- result.append(f"\n[INSERT_POINT_{counter:03d}]\n")
44
  counter += 1
45
  previous_was_tag = True
46
  else:
@@ -49,7 +54,7 @@ def insert_points(text):
49
  # If this is a heading, add a tag before it
50
  elif segment.startswith('\n#'):
51
  if not previous_was_tag:
52
- result.append(f"\n[INSERT_POINT_{counter:03d}]")
53
  counter += 1
54
  previous_was_tag = True
55
  result.append(segment)
@@ -63,22 +68,15 @@ def insert_points(text):
63
  # Join all segments back together
64
  text = ''.join(result)
65
 
66
- # -----------------------------
67
- # 3) ADD TAG AT THE BEGINNING IF NEEDED
68
- # -----------------------------
69
- if not text.startswith('[INSERT_POINT_'):
70
- text = f"[INSERT_POINT_{counter:03d}]\n" + text
71
- counter += 1
72
-
73
  # -----------------------------
74
  # 4) CLEAN UP: Remove excess newlines
75
  # -----------------------------
76
  # Remove extra blank lines before tags
77
- text = re.sub(r'\n\n+(\[INSERT_POINT_)', r'\n\1', text)
78
  # Remove extra blank lines at the beginning
79
  text = re.sub(r'^\n+', '', text)
80
  # Ensure no consecutive tags
81
- text = re.sub(r'(\[INSERT_POINT_\d{3}]\n)\s*\[INSERT_POINT_\d{3}]', r'\1', text)
82
 
83
  return text
84
 
@@ -96,11 +94,8 @@ demo = gr.Interface(
96
  outputs=gr.Textbox(label="Processed Text with Tags"),
97
  title="Insert Point Tagger",
98
  description=(
99
- "This processor:\n"
100
- "1) Tags headings and paragraph breaks with sequential numbers.\n"
101
- "2) Places each tag on its own line.\n"
102
- "3) Ensures consistent, sequential numbering (001, 002, etc.).\n"
103
- "4) Avoids consecutive tags - never two tags in a row."
104
  ),
105
  )
106
 
 
3
 
4
 
5
  def insert_points(text):
6
+ # Start the counter at 1
7
  counter = 1
8
 
9
  # -----------------------------
 
26
  text = '\n' + text
27
 
28
  # -----------------------------
29
+ # 2) PREPARE FOR PROCESSING
 
30
  # -----------------------------
31
+ # First tag should be 001 and come at the beginning
32
+ result = [f"[POSITION_{counter:03d}]\n"]
33
+ counter += 1
34
 
35
  # Split the text into segments (paragraphs and headings)
36
  segments = re.split(r'(\n#+\s+.+?(?=\n|$)|\n)', text)
37
  segments = [s for s in segments if s] # Remove empty segments
38
 
39
+ previous_was_tag = True # Since we just added the first tag
 
40
 
41
+ # -----------------------------
42
+ # 3) PROCESS SEGMENTS
43
+ # -----------------------------
44
  for segment in segments:
45
  # If this is a newline, add a tag (but not after another tag)
46
  if segment == '\n':
47
  if not previous_was_tag:
48
+ result.append(f"\n[POSITION_{counter:03d}]\n")
49
  counter += 1
50
  previous_was_tag = True
51
  else:
 
54
  # If this is a heading, add a tag before it
55
  elif segment.startswith('\n#'):
56
  if not previous_was_tag:
57
+ result.append(f"\n[POSITION_{counter:03d}]")
58
  counter += 1
59
  previous_was_tag = True
60
  result.append(segment)
 
68
  # Join all segments back together
69
  text = ''.join(result)
70
 
 
 
 
 
 
 
 
71
  # -----------------------------
72
  # 4) CLEAN UP: Remove excess newlines
73
  # -----------------------------
74
  # Remove extra blank lines before tags
75
+ text = re.sub(r'\n\n+(\[POSITION_)', r'\n\1', text)
76
  # Remove extra blank lines at the beginning
77
  text = re.sub(r'^\n+', '', text)
78
  # Ensure no consecutive tags
79
+ text = re.sub(r'(\[POSITION_\d{3}]\n)\s*\[POSITION_\d{3}]', r'\1', text)
80
 
81
  return text
82
 
 
94
  outputs=gr.Textbox(label="Processed Text with Tags"),
95
  title="Insert Point Tagger",
96
  description=(
97
+ "This processor inserts numbered tags between paragraphs and before #-headers"
98
+
 
 
 
99
  ),
100
  )
101