BtB-ExpC committed on
Commit
8f04db2
·
1 Parent(s): c6710ef
Files changed (1) hide show
  1. app.py +43 -31
app.py CHANGED
@@ -6,55 +6,66 @@ def insert_points(text):
6
  counter = 1
7
 
8
  # -----------------------------
9
- # 1) COLLAPSE MULTIPLE BLANK LINES INTO ONE
10
- # e.g. 2+ newlines in a row => single newline
11
  # -----------------------------
 
 
 
 
12
  text = re.sub(r'\n\s*\n+', '\n', text)
13
 
14
  # -----------------------------
15
- # 2) PROCESS HEADINGS FIRST (before dealing with newlines)
16
- # This ensures headings get properly tagged
 
 
 
 
 
 
 
 
 
 
17
  # -----------------------------
18
  def heading_replacer(m):
19
  nonlocal counter
20
- # Get the full heading match
21
- full_heading = m.group(0)
22
- # Extract just the hash symbols
23
- hashes = m.group(1)
24
- # Insert a newline before and after the tag
25
- replacement = f"\n[INSERT_POINT_{counter:03d}]\n{full_heading}"
26
  counter += 1
27
- return replacement
28
 
29
- # Match headings that are at the start of a line (preceded by newline or start of string)
30
- # We capture the full heading pattern including content after the hashes
31
- text = re.sub(r'(^|\n)(\#{1,6}[ \t].+?)(?=\n|$)', heading_replacer, text)
32
 
33
  # -----------------------------
34
- # 3) INSERT A TAG BEFORE EACH PARAGRAPH BREAK
35
- # These are newlines that aren't already tagged
36
  # -----------------------------
37
- def newline_replacer(_match):
38
  nonlocal counter
39
- # Insert tag on its own line
40
- out = f"\n[INSERT_POINT_{counter:03d}]\n"
41
  counter += 1
42
- return out
43
 
44
- # Replace newlines that aren't already preceded by a tag
45
- text = re.sub(r'(?<!\[INSERT_POINT_\d{3}]\n)\n', newline_replacer, text)
46
 
47
  # -----------------------------
48
- # 4) CLEAN UP: Remove any extra newlines before tags
49
- # (Sometimes we might have created double newlines)
50
  # -----------------------------
51
- text = re.sub(r'\n\n(\[INSERT_POINT_\d{3}])', r'\n\1', text)
52
-
53
- # If text doesn't start with a newline + tag, add one at the beginning
54
  if not text.startswith('\n[INSERT_POINT_'):
55
  text = f"\n[INSERT_POINT_{counter:03d}]\n" + text
56
  counter += 1
57
 
 
 
 
 
 
 
 
 
58
  return text
59
 
60
 
@@ -71,10 +82,11 @@ demo = gr.Interface(
71
  outputs=gr.Textbox(label="Processed Text with Tags"),
72
  title="Insert Point Tagger",
73
  description=(
74
- "1) Collapses multiple blank lines into one.\n"
75
- "2) Inserts [INSERT_POINT_XXX] tags on their own lines.\n"
76
- "3) Properly tags headings and paragraph breaks.\n"
77
- "4) Maintains sequential numbering for all tags."
 
78
  ),
79
  )
80
 
 
6
  counter = 1
7
 
8
  # -----------------------------
9
+ # 0) NORMALIZE LINE ENDINGS AND REMOVE EXCESS WHITESPACE
 
10
  # -----------------------------
11
+ text = text.strip()
12
+ # Replace Windows-style line endings
13
+ text = text.replace('\r\n', '\n')
14
+ # Collapse multiple blank lines into one
15
  text = re.sub(r'\n\s*\n+', '\n', text)
16
 
17
  # -----------------------------
18
+ # 1) FIRST PASS: INSERT NEWLINES BEFORE HEADINGS
19
+ # This separates headings that might be in running text
20
+ # -----------------------------
21
+ text = re.sub(r'(\s+)(#+\s+)', r'\1\n\2', text)
22
+
23
+ # Special case for heading at the start without newline
24
+ if text.startswith('#'):
25
+ text = '\n' + text
26
+
27
+ # -----------------------------
28
+ # 2) TAG ALL HEADINGS
29
+ # Now that headings are on their own lines, tag them
30
  # -----------------------------
31
  def heading_replacer(m):
32
  nonlocal counter
33
+ heading = m.group(1) # The entire heading including hash symbols
34
+ tag = f"[INSERT_POINT_{counter:03d}]\n"
 
 
 
 
35
  counter += 1
36
+ return f"\n{tag}{heading}"
37
 
38
+ # Look for any heading pattern (one or more # followed by space and text)
39
+ text = re.sub(r'\n(#+\s+.+?)(?=\n|$)', heading_replacer, text)
 
40
 
41
  # -----------------------------
42
+ # 3) TAG PARAGRAPH BREAKS
43
+ # Add tags at meaningful paragraph breaks
44
  # -----------------------------
45
+ def paragraph_replacer(m):
46
  nonlocal counter
47
+ tag = f"[INSERT_POINT_{counter:03d}]\n"
 
48
  counter += 1
49
+ return f"\n{tag}"
50
 
51
+ # Find newlines that aren't already followed by a tag
52
+ text = re.sub(r'\n(?!\[INSERT_POINT_)', paragraph_replacer, text)
53
 
54
  # -----------------------------
55
+ # 4) ADD TAG AT THE BEGINNING IF NEEDED
 
56
  # -----------------------------
 
 
 
57
  if not text.startswith('\n[INSERT_POINT_'):
58
  text = f"\n[INSERT_POINT_{counter:03d}]\n" + text
59
  counter += 1
60
 
61
+ # -----------------------------
62
+ # 5) CLEAN UP: Remove excess newlines
63
+ # -----------------------------
64
+ # Remove extra blank lines before tags
65
+ text = re.sub(r'\n\n+(\[INSERT_POINT_)', r'\n\1', text)
66
+ # Remove extra blank lines at the beginning
67
+ text = re.sub(r'^\n+', '', text)
68
+
69
  return text
70
 
71
 
 
82
  outputs=gr.Textbox(label="Processed Text with Tags"),
83
  title="Insert Point Tagger",
84
  description=(
85
+ "This processor:\n"
86
+ "1) Separates and tags headings (text starting with #).\n"
87
+ "2) Tags paragraph breaks with sequential numbers.\n"
88
+ "3) Places each tag on its own line.\n"
89
+ "4) Works with heading patterns that may be embedded in running text."
90
  ),
91
  )
92