dataset-tldr

Paused

App Files Files Community

davanstrien HF Staff committed on Apr 8, 2024

Commit

1f7ca14

1 Parent(s): fe9092a

Add card_processing.py for markdown parsing and text loading

Browse files

Files changed (1) hide show

card_processing.py +72 -0

card_processing.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import re
+from huggingface_hub import DatasetCard
# Placeholder line emitted by the older dataset-card template (always linked).
_MORE_INFO_PATTERN = re.compile(r"\[More Information Needed\]\(https?://\S+\)")
# A complete single-line HTML comment; multi-line comments are not handled.
_HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->")
# Line prefixes that open or close a fenced code block.
_CODE_FENCE_MARKERS = ("```", "~~~")


def _heading_level(line):
    """Return the number of leading ``#`` characters in *line* (0 if none)."""
    return len(line) - len(line.lstrip("#"))


def _drop_trailing_headings(entries, min_level):
    """Pop headings of depth >= *min_level* off the end of *entries*.

    *entries* holds ``(level, line)`` pairs where content lines use level 0,
    so the loop stops at the first content line it meets.
    """
    while entries and entries[-1][0] >= min_level:
        entries.pop()


def parse_markdown(markdown_text):
    """Strip template boilerplate from a dataset card's markdown text.

    The text is processed line by line:

    * blank lines are dropped;
    * a line containing "Table of Contents" and everything up to the next
      heading is dropped;
    * a line starting with "[More Information Needed](<url>)" marks the rest
      of its section as filler: lines are skipped until the next heading;
    * single-line HTML comments are removed; text that shares a line with a
      comment is kept;
    * a heading that ends up with no content beneath it (before the next
      heading of the same or a shallower level, or before the end of the
      text) is removed;
    * lines inside fenced code blocks are kept verbatim, so ``# comment``
      lines there are never mistaken for headings.

    Args:
        markdown_text: The markdown body of a card. ``None`` or an empty
            string yields an empty result.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    if not markdown_text:
        return ""

    entries = []  # (heading level, line); level 0 marks a content line
    skip_section = False
    table_of_contents = False
    in_code_fence = False

    for line in markdown_text.split("\n"):
        stripped = line.strip()
        dropping = skip_section or table_of_contents

        # Fences are tracked even while a section is being dropped, so a
        # "# comment" inside skipped code cannot end the skip early.
        if stripped.startswith(_CODE_FENCE_MARKERS):
            in_code_fence = not in_code_fence
            if not dropping:
                entries.append((0, line))
            continue
        if in_code_fence:
            if stripped and not dropping:
                entries.append((0, line))
            continue

        if "Table of Contents" in line:
            table_of_contents = True
            continue

        is_heading = line.startswith("#")
        if table_of_contents:
            if not is_heading:
                continue
            table_of_contents = False

        if is_heading:
            # A new heading always ends the filler-skipping of the previous
            # section, and closes every open section at the same or a deeper
            # level; those that never received content are discarded.
            skip_section = False
            level = _heading_level(line)
            _drop_trailing_headings(entries, level)
            entries.append((level, line))
            continue

        if skip_section:
            continue

        if "<!--" in line:
            # Remove the comment but keep any real text on the same line.
            line = _HTML_COMMENT_PATTERN.sub("", line).strip()
            stripped = line

        if not stripped:
            continue

        if _MORE_INFO_PATTERN.match(stripped):
            skip_section = True
            continue

        entries.append((0, line))

    # Headings left dangling at the very end have no content either.
    _drop_trailing_headings(entries, 1)
    return "\n".join(text for _, text in entries)
# Placeholder phrases left behind by an unfilled card template, removed in
# this order. The "[More Information Needed]" placeholder may carry a
# markdown link target, which is removed together with it.
_TEMPLATE_PLACEHOLDER_PATTERNS = (
    re.compile(r"\[More Information Needed\](?:\(https?://[^\s)]+\))?"),
    re.compile(r"\[optional\]"),
)


def is_empty_template(text):
    """Return True if *text* contains nothing but template placeholders.

    Every "[More Information Needed]" (with or without a trailing
    "(http…)" link) and every "[optional]" marker is removed; the text counts
    as empty when only whitespace is left.

    Args:
        text: The text to inspect. ``None`` and the empty string are treated
            as empty.

    Returns:
        ``True`` when no content other than placeholders and whitespace is
        present, ``False`` otherwise.
    """
    if not text:
        return True
    for pattern in _TEMPLATE_PLACEHOLDER_PATTERNS:
        text = pattern.sub("", text)
    return not text.strip()
def try_load_text(row):
    """Best-effort extraction of the card text stored under ``row["card"]``.

    Builds a ``DatasetCard`` from ``row["card"]`` and returns its ``text``
    attribute. If anything in that process raises, the exception is printed
    and ``None`` is returned instead of propagating the error.

    Args:
        row: A mapping-like object with a ``"card"`` entry; presumably the raw
            card content as a string (confirm against the caller).

    Returns:
        The card's ``text`` attribute, or ``None`` when loading failed.
    """
    loaded_text = None
    try:
        loaded_text = DatasetCard(row["card"]).text
    except Exception as error:
        # Deliberately broad: one bad card must not abort processing.
        print(error)
    return loaded_text