Spaces:

BtB-ExpC
/

InsertElementsTags

Sleeping

App Files Files Community

BtB-ExpC committed on Apr 14, 2025

Commit

f414276

1 Parent(s): 96bc994

ignore consecutive newlines

Browse files

Files changed (1) hide show

app.py +39 -31

app.py CHANGED Viewed

@@ -2,43 +2,49 @@ import re
 import gradio as gr
 def insert_points(text):
-    # Initialize a counter accessible by the replacer function
     counter = 1
-    # This function will be called for each match found by re.sub
-    # It inserts the tag *before* the matched text (newline or hash sequence)
     def replacer(match):
-        nonlocal counter  # Use the counter from the outer scope
         tag = f"[INSERT_POINT_{counter:03d}]"
         counter += 1
-        # match.group(0) contains the actual matched string ('\n' or '##' etc.)
         return tag + match.group(0)
-    # The pattern looks for either:
-    # 1) A newline character ('\n')
-    # 2) A sequence of one or more '#' characters ('\#+')
-    #    '#' needs to be escaped ('\#') because it's a special regex character.
-    # The parentheses create capturing groups, but match.group(0) gives the whole match anyway.
-    pattern = r'(\n|\#+)'
-    # Use re.sub to find all matches of the pattern and replace them
-    # by calling the 'replacer' function for each match.
-    processed_text = re.sub(pattern, replacer, text)
-    # One edge case: If the *very beginning* of the text starts with '#',
-    # the regex above won't match anything *before* it.
-    # We need to check if the text starts with hashes (possibly after whitespace)
-    # and prepend the first tag if necessary.
-    # However, the current `re.sub(pattern, replacer, text)` already handles this
-    # correctly because it finds the '#' sequence at the beginning and the
-    # replacer adds the tag *before* it. Let's re-verify.
-    # Example: If text is "## Title", pattern finds "##" at index 0.
-    # Replacer runs, returns "[INSERT_POINT_001]##". Result is correct.
-    # So, no special handling for the beginning is needed with this pattern.
     return processed_text
-# --- Gradio Interface Code (Unchanged from your original) ---
 demo = gr.Interface(
     fn=insert_points,
     inputs=gr.Textbox(
@@ -46,13 +52,15 @@ demo = gr.Interface(
         placeholder="Paste your text here...",
         label="Your Input Text"
     ),
-    outputs=gr.Textbox(label="Processed Text with Tags"), # Changed output type to Textbox for better viewing
     title="Insert Point Tagger",
-    # Updated description for clarity
     description=(
-        "Paste a block of text and get '[INSERT_POINT_###]' tags added "
-        "1) **before** every newline, and 2) **before** every '#' sequence (heading)."
     ),
 )
 if __name__ == "__main__":

 import gradio as gr
 def insert_points(text):
     counter = 1
+    # This function will be called for each match by re.sub in the first pass
     def replacer(match):
+        nonlocal counter
         tag = f"[INSERT_POINT_{counter:03d}]"
         counter += 1
+        # Return the tag followed by the original matched text
+        # group(0) will be one or more newlines OR one or more hashes
         return tag + match.group(0)
+    # --- Step 1: Insert tags before hash sequences and consolidated newline sequences ---
+    # Pattern matches:
+    # 1) '\n+' : One or more consecutive newline characters. This handles basic blank lines.
+    #             To handle lines with only whitespace, a more complex pattern like
+    #             '(\s*\n)+\s*' might be needed, but let's stick to '\n+' based on the example.
+    #             This change ensures that \n\n or \n\n\n only trigger ONE tag.
+    # 2) '\#+' : One or more consecutive '#' characters.
+    # This pattern might still create "[TAG1]\n[TAG2]###" if a newline immediately precedes a heading.
+    pattern_initial = r'(\n+|\#+)'
+    processed_text = re.sub(pattern_initial, replacer, text)
+    # --- Step 2: Clean up potential heading splits ---
+    # This step addresses the case where Step 1 resulted in:
+    # [INSERT_POINT_XXX]<whitespace like \n>[INSERT_POINT_YYY]### Heading
+    # We want to remove the first tag and the intermediate whitespace, keeping the tag associated with the ###.
+    # The cleanup pattern finds:
+    #   (\[INSERT_POINT_\d{3}\]) : Capture Group 1: The tag before the newline/whitespace (e.g., TAG_XXX)
+    #   \s* : Any intermediate whitespace (importantly, including the newline)
+    #   (\[INSERT_POINT_\d{3}\]) : Capture Group 2: The tag right before the hashes (e.g., TAG_YYY)
+    #   (\#+)                    : Capture Group 3: The actual hash sequence (e.g., ###)
+    cleanup_pattern = r'(\[INSERT_POINT_\d{3}\])\s*(\[INSERT_POINT_\d{3}\])(\#+)'
+    # The replacement uses:
+    #   \2 : Capture Group 2 (the tag we want to keep, TAG_YYY)
+    #   \3 : Capture Group 3 (the hash sequence)
+    # This effectively deletes the first tag (Group 1) and the intermediate whitespace.
+    processed_text = re.sub(cleanup_pattern, r'\2\3', processed_text)
     return processed_text
+# --- Gradio Interface Code (Updated Description) ---
 demo = gr.Interface(
     fn=insert_points,
     inputs=gr.Textbox(
         placeholder="Paste your text here...",
         label="Your Input Text"
     ),
+    outputs=gr.Textbox(label="Processed Text with Tags"),
     title="Insert Point Tagger",
     description=(
+        "Paste a block of text and get '[INSERT_POINT_###]' tags added:\n"
+        "1) **before** each sequence of one or more '#' characters (headings).\n"
+        "2) **before** each sequence of one or more newline characters (e.g., one tag for line breaks or blank lines)."
     ),
+    # You might add allow_flagging='never' if you don't need the flagging feature
+    # allow_flagging='never'
 )
 if __name__ == "__main__":