BtB-ExpC committed on
Commit
73a9e74
·
1 Parent(s): f414276

Normalized line endings: stray \r carriage returns (e.g. from text pasted out of Word) are converted to \n

Browse files
Files changed (1) hide show
  1. app.py +23 -29
app.py CHANGED
@@ -2,9 +2,14 @@ import re
2
  import gradio as gr
3
 
4
  def insert_points(text):
 
 
 
 
 
5
  counter = 1
6
 
7
- # This function will be called for each match by re.sub in the first pass
8
  def replacer(match):
9
  nonlocal counter
10
  tag = f"[INSERT_POINT_{counter:03d}]"
@@ -15,52 +20,41 @@ def insert_points(text):
15
 
16
  # --- Step 1: Insert tags before hash sequences and consolidated newline sequences ---
17
  # Pattern matches:
18
- # 1) '\n+' : One or more consecutive newline characters. This handles basic blank lines.
19
- # To handle lines with only whitespace, a more complex pattern like
20
- # '(\s*\n)+\s*' might be needed, but let's stick to '\n+' based on the example.
21
- # This change ensures that \n\n or \n\n\n only trigger ONE tag.
22
  # 2) '\#+' : One or more consecutive '#' characters.
23
- # This pattern might still create "[TAG1]\n[TAG2]###" if a newline immediately precedes a heading.
24
  pattern_initial = r'(\n+|\#+)'
25
  processed_text = re.sub(pattern_initial, replacer, text)
26
 
27
  # --- Step 2: Clean up potential heading splits ---
28
- # This step addresses the case where Step 1 resulted in:
29
- # [INSERT_POINT_XXX]<whitespace like \n>[INSERT_POINT_YYY]### Heading
30
- # We want to remove the first tag and the intermediate whitespace, keeping the tag associated with the ###.
31
-
32
- # The cleanup pattern finds:
33
- # (\[INSERT_POINT_\d{3}\]) : Capture Group 1: The tag before the newline/whitespace (e.g., TAG_XXX)
34
- # \s* : Any intermediate whitespace (importantly, including the newline)
35
- # (\[INSERT_POINT_\d{3}\]) : Capture Group 2: The tag right before the hashes (e.g., TAG_YYY)
36
- # (\#+) : Capture Group 3: The actual hash sequence (e.g., ###)
37
  cleanup_pattern = r'(\[INSERT_POINT_\d{3}\])\s*(\[INSERT_POINT_\d{3}\])(\#+)'
38
-
39
- # The replacement uses:
40
- # \2 : Capture Group 2 (the tag we want to keep, TAG_YYY)
41
- # \3 : Capture Group 3 (the hash sequence)
42
- # This effectively deletes the first tag (Group 1) and the intermediate whitespace.
43
  processed_text = re.sub(cleanup_pattern, r'\2\3', processed_text)
44
 
45
  return processed_text
46
 
47
- # --- Gradio Interface Code (Updated Description) ---
48
  demo = gr.Interface(
49
  fn=insert_points,
50
  inputs=gr.Textbox(
51
  lines=10,
52
- placeholder="Paste your text here...",
53
  label="Your Input Text"
54
  ),
55
- outputs=gr.Textbox(label="Processed Text with Tags"),
56
- title="Insert Point Tagger",
 
 
 
57
  description=(
58
- "Paste a block of text and get '[INSERT_POINT_###]' tags added:\n"
59
- "1) **before** each sequence of one or more '#' characters (headings).\n"
60
- "2) **before** each sequence of one or more newline characters (e.g., one tag for line breaks or blank lines)."
 
61
  ),
62
- # You might add allow_flagging='never' if you don't need the flagging feature
63
- # allow_flagging='never'
64
  )
65
 
66
  if __name__ == "__main__":
 
2
  import gradio as gr
3
 
4
  def insert_points(text):
5
+ # --- Step 0: Normalize line endings ---
6
+ # Replace Windows (\r\n) and old Mac (\r) line endings with Unix (\n)
7
+ # This simplifies the regex patterns below.
8
+ text = text.replace('\r\n', '\n').replace('\r', '\n')
9
+
10
  counter = 1
11
 
12
+ # Replacer function for the initial insertion pass
13
  def replacer(match):
14
  nonlocal counter
15
  tag = f"[INSERT_POINT_{counter:03d}]"
 
20
 
21
  # --- Step 1: Insert tags before hash sequences and consolidated newline sequences ---
22
  # Pattern matches:
23
+ # 1) '\n+' : One or more consecutive (now normalized) newline characters. Collapses blank lines.
 
 
 
24
  # 2) '\#+' : One or more consecutive '#' characters.
 
25
  pattern_initial = r'(\n+|\#+)'
26
  processed_text = re.sub(pattern_initial, replacer, text)
27
 
28
  # --- Step 2: Clean up potential heading splits ---
29
+ # This fixes cases where Step 1 resulted in "[TAG_A]\n[TAG_B]###"
30
+ # It looks for TAG_A, followed by whitespace (\s* includes the \n),
31
+ # followed by TAG_B, followed by hashes (#+).
 
 
 
 
 
 
32
  cleanup_pattern = r'(\[INSERT_POINT_\d{3}\])\s*(\[INSERT_POINT_\d{3}\])(\#+)'
33
+ # Replaces the whole match with just TAG_B (\2) and the hashes (\3).
 
 
 
 
34
  processed_text = re.sub(cleanup_pattern, r'\2\3', processed_text)
35
 
36
  return processed_text
37
 
38
+ # --- Gradio Interface Code (with Copy Button) ---
39
  demo = gr.Interface(
40
  fn=insert_points,
41
  inputs=gr.Textbox(
42
  lines=10,
43
+ placeholder="Paste your text here...\n(Line endings will be normalized)",
44
  label="Your Input Text"
45
  ),
46
+ outputs=gr.Textbox(
47
+ label="Processed Text with Tags",
48
+ show_copy_button=True # <--- Added copy button here
49
+ ),
50
+ title="Insert Point Tagger v3", # Optional: update title
51
  description=(
52
+ "Paste text to add '[INSERT_POINT_###]' tags:\n"
53
+ "1) **Before** each '#' sequence (headings).\n"
54
+ "2) **Before** each sequence of one or more newlines (one tag per line break/blank line).\n"
55
+ "Normalizes line endings and cleans up heading tag spacing."
56
  ),
57
+ allow_flagging='never'
 
58
  )
59
 
60
  if __name__ == "__main__":