BtB-ExpC committed on
Commit
c6710ef
·
1 Parent(s): d4f7222

new approach

Browse files
Files changed (1) hide show
  1. app.py +36 -29
app.py CHANGED
@@ -12,40 +12,48 @@ def insert_points(text):
12
  text = re.sub(r'\n\s*\n+', '\n', text)
13
 
14
  # -----------------------------
15
- # 2) INSERT A TAG BEFORE EACH REMAINING NEWLINE
16
- # So every line (including blank ones) is prefixed by exactly one tag
17
  # -----------------------------
18
- def newline_replacer(_match):
19
  nonlocal counter
20
- out = f"[INSERT_POINT_{counter:03d}]\n"
 
 
 
 
 
21
  counter += 1
22
- return out
23
 
24
- text = re.sub(r'\n', newline_replacer, text)
 
 
25
 
26
  # -----------------------------
27
- # 3) INSERT A TAG BEFORE EACH HEADING IF IT'S NOT
28
- # (A) ALREADY TAGGED, OR
29
- # (B) IMMEDIATELY AFTER A NEWLINE (WHICH WOULD HAVE A TAG)
30
- #
31
- # We detect # or ## or ### up to 6 in a row. If there's no
32
- # [INSERT_POINT_\d{3}] or newline right before, we insert a new tag.
33
  # -----------------------------
34
- def heading_replacer(m):
35
  nonlocal counter
36
- # group(1) is the string of hashes, like "##" or "#####"
37
- hashes = m.group(1)
38
- replacement = f"[INSERT_POINT_{counter:03d}]{hashes}"
39
  counter += 1
40
- return replacement
41
 
42
- # Negative lookbehind means:
43
- # - Not preceded by [INSERT_POINT_###]
44
- # - Not preceded by a newline
45
- # So if the line starts with "##" and we've just inserted a tag for the line break,
46
- # we won't insert a second tag for the heading.
47
- pattern = r'(?<!\[INSERT_POINT_\d{3}\])(?<!\n)(\#{1,6})'
48
- text = re.sub(pattern, heading_replacer, text)
 
 
 
 
 
 
49
 
50
  return text
51
 
@@ -64,12 +72,11 @@ demo = gr.Interface(
64
  title="Insert Point Tagger",
65
  description=(
66
  "1) Collapses multiple blank lines into one.\n"
67
- "2) Inserts a single [INSERT_POINT_XXX] tag before each remaining newline.\n"
68
- "3) Inserts a single [INSERT_POINT_XXX] tag before any '#' heading, unless\n"
69
- " there's already a tag from the preceding newline.\n"
70
- "This avoids double tags while keeping heading text on the same line."
71
  ),
72
  )
73
 
74
  if __name__ == "__main__":
75
- demo.launch()
 
12
  text = re.sub(r'\n\s*\n+', '\n', text)
13
 
14
  # -----------------------------
15
+ # 2) PROCESS HEADINGS FIRST (before dealing with newlines)
16
+ # This ensures headings get properly tagged
17
  # -----------------------------
18
+ def heading_replacer(m):
19
  nonlocal counter
20
+ # Get the full match (group 0): the line-start prefix (if any) plus the heading text
21
+ full_heading = m.group(0)
22
+ # NOTE: group 1 is the line-start prefix ('' or a newline), not the hash symbols; the heading text is group 2
23
+ hashes = m.group(1)
24
+ # Insert a newline before and after the tag
25
+ replacement = f"\n[INSERT_POINT_{counter:03d}]\n{full_heading}"
26
  counter += 1
27
+ return replacement
28
 
29
+ # Match headings that are at the start of a line (preceded by newline or start of string)
30
+ # We capture the full heading pattern including content after the hashes
31
+ text = re.sub(r'(^|\n)(\#{1,6}[ \t].+?)(?=\n|$)', heading_replacer, text)
32
 
33
  # -----------------------------
34
+ # 3) INSERT A TAG BEFORE EACH PARAGRAPH BREAK
35
+ # These are newlines that aren't already tagged
 
 
 
 
36
  # -----------------------------
37
+ def newline_replacer(_match):
38
  nonlocal counter
39
+ # Insert tag on its own line
40
+ out = f"\n[INSERT_POINT_{counter:03d}]\n"
 
41
  counter += 1
42
+ return out
43
 
44
+ # Replace each newline, skipping only a newline that directly follows a tag line (tag + newline); the newline right after a tag's "]" still matches
45
+ text = re.sub(r'(?<!\[INSERT_POINT_\d{3}]\n)\n', newline_replacer, text)
46
+
47
+ # -----------------------------
48
+ # 4) CLEAN UP: Remove any extra newlines before tags
49
+ # (Sometimes we might have created double newlines)
50
+ # -----------------------------
51
+ text = re.sub(r'\n\n(\[INSERT_POINT_\d{3}])', r'\n\1', text)
52
+
53
+ # If text doesn't start with a newline + tag, add one at the beginning
54
+ if not text.startswith('\n[INSERT_POINT_'):
55
+ text = f"\n[INSERT_POINT_{counter:03d}]\n" + text
56
+ counter += 1
57
 
58
  return text
59
 
 
72
  title="Insert Point Tagger",
73
  description=(
74
  "1) Collapses multiple blank lines into one.\n"
75
+ "2) Inserts [INSERT_POINT_XXX] tags on their own lines.\n"
76
+ "3) Properly tags headings and paragraph breaks.\n"
77
+ "4) Maintains sequential numbering for all tags."
 
78
  ),
79
  )
80
 
81
  if __name__ == "__main__":
82
+ demo.launch()