Spaces:
Running
Running
Commit ·
9c16219
1
Parent(s): 0735175
fix: improve complex link extraction for Twitch-style markdown
Browse files- backend/app/api/routes/scrape.py +109 -26
backend/app/api/routes/scrape.py
CHANGED
|
@@ -702,9 +702,13 @@ def _extract_markdown_link_rows(
|
|
| 702 |
|
| 703 |
# Patterns for extracting content
|
| 704 |
# Match markdown links like [Title](URL) but NOT image links like ![alt](URL)
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 708 |
likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
|
| 709 |
comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
|
| 710 |
date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
|
|
@@ -754,6 +758,57 @@ def _extract_markdown_link_rows(
|
|
| 754 |
if any(label == lowered_line for label in boilerplate_labels):
|
| 755 |
continue
|
| 756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
# Find content links (not images)
|
| 758 |
for match in content_link_pattern.finditer(line):
|
| 759 |
title = match.group(1).strip()
|
|
@@ -847,30 +902,58 @@ def _extract_markdown_link_rows(
|
|
| 847 |
candidate_rows.append((quality_score, row))
|
| 848 |
|
| 849 |
# Also look for standalone lines with view counts (sometimes titles are separate from links)
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
if
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 872 |
candidate_rows.sort(key=lambda x: x[0], reverse=True)
|
| 873 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 874 |
|
| 875 |
|
| 876 |
def _extract_rows_from_text_render(
|
|
|
|
| 702 |
|
| 703 |
# Patterns for extracting content
|
| 704 |
# Match markdown links like [Title](URL) but NOT image links like ![alt](URL)
|
| 705 |
+
# URL ends at first space, quote, or closing paren
|
| 706 |
+
content_link_pattern = re.compile(r'(?<!!)\[([^\]]+)\]\((https?://[^\s"\)]+)')
|
| 707 |
+
# Match complex links with embedded images: [![Image alt](image_url) Text](link_url)
|
| 708 |
+
# This captures the text after the image and the final link
|
| 709 |
+
complex_link_pattern = re.compile(r'\[!\[Image[^\]]*\]\([^\)]+\)\s*([^\]]+)\]\((https?://[^\s"\)]+)\)')
|
| 710 |
+
# Match view/viewer counts anywhere (including "47.2K viewers" format)
|
| 711 |
+
views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*(?:views?|viewers?)', re.IGNORECASE)
|
| 712 |
likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
|
| 713 |
comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
|
| 714 |
date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
|
|
|
|
| 758 |
if any(label == lowered_line for label in boilerplate_labels):
|
| 759 |
continue
|
| 760 |
|
| 761 |
+
# First check for complex links (like Twitch format: [![Image](image_url) StreamerName Game Live 22K viewers](channel_url))
|
| 762 |
+
complex_match = complex_link_pattern.search(line)
|
| 763 |
+
if complex_match:
|
| 764 |
+
embedded_text = complex_match.group(1).strip()
|
| 765 |
+
link = complex_match.group(2).strip()
|
| 766 |
+
|
| 767 |
+
# Parse embedded text: "StreamerName Game Live 22K 22K viewers Use the..."
|
| 768 |
+
# Remove "Use the..." suffix and Hype Train info
|
| 769 |
+
embedded_text = re.sub(r'\s*Use the.*$', '', embedded_text)
|
| 770 |
+
embedded_text = re.sub(r'\s*Hype Train.*$', '', embedded_text, flags=re.IGNORECASE)
|
| 771 |
+
# Extract viewer count first
|
| 772 |
+
viewer_match = views_pattern.search(embedded_text)
|
| 773 |
+
viewers = viewer_match.group(1) if viewer_match else ""
|
| 774 |
+
# Remove ALL occurrences of viewer count patterns, including orphan "ers"/"viewers"
|
| 775 |
+
name_game = re.sub(r'\d+(?:[.,]\d+)?[KkMmBb]?\s*(?:views?|viewers?|ers)?', '', embedded_text, flags=re.IGNORECASE)
|
| 776 |
+
# Remove standalone "ers" or "viewers" that might remain
|
| 777 |
+
name_game = re.sub(r'\b(?:ers|viewers?|views?)\b', '', name_game, flags=re.IGNORECASE)
|
| 778 |
+
# Remove "Live"
|
| 779 |
+
name_game = re.sub(r'\bLive\b', '', name_game, flags=re.IGNORECASE)
|
| 780 |
+
# Collapse whitespace
|
| 781 |
+
name_game = re.sub(r'\s+', ' ', name_game).strip()
|
| 782 |
+
|
| 783 |
+
# Split into name and game (heuristic: first word is name, rest is game)
|
| 784 |
+
parts = name_game.split(maxsplit=1)
|
| 785 |
+
streamer_name = parts[0] if parts else ""
|
| 786 |
+
game = parts[1].strip() if len(parts) > 1 else ""
|
| 787 |
+
|
| 788 |
+
if streamer_name and link:
|
| 789 |
+
link_normalized = link.split('?')[0]
|
| 790 |
+
if link_normalized not in seen_links:
|
| 791 |
+
seen_links.add(link_normalized)
|
| 792 |
+
|
| 793 |
+
row: dict[str, Any] = {}
|
| 794 |
+
for col in columns:
|
| 795 |
+
lower_col = col.lower()
|
| 796 |
+
if lower_col in {"url", "link", "href", "channel"}:
|
| 797 |
+
row[col] = link
|
| 798 |
+
elif lower_col in {"title", "name", "streamer_name", "streamer", "username"}:
|
| 799 |
+
row[col] = streamer_name
|
| 800 |
+
elif lower_col in {"game", "category", "playing"}:
|
| 801 |
+
row[col] = game
|
| 802 |
+
elif lower_col in {"views", "view_count", "viewers", "viewer_count"}:
|
| 803 |
+
row[col] = viewers
|
| 804 |
+
else:
|
| 805 |
+
row[col] = ""
|
| 806 |
+
|
| 807 |
+
# Streams with viewers are highly relevant
|
| 808 |
+
score = 5 if viewers else 2
|
| 809 |
+
candidate_rows.append((score, row))
|
| 810 |
+
continue # Move to next line
|
| 811 |
+
|
| 812 |
# Find content links (not images)
|
| 813 |
for match in content_link_pattern.finditer(line):
|
| 814 |
title = match.group(1).strip()
|
|
|
|
| 902 |
candidate_rows.append((quality_score, row))
|
| 903 |
|
| 904 |
# Also look for standalone lines with view counts (sometimes titles are separate from links)
|
| 905 |
+
# But only if we haven't found enough rows with proper links
|
| 906 |
+
if len(candidate_rows) < row_limit:
|
| 907 |
+
for i, views in line_views.items():
|
| 908 |
+
if i > 0 and len(candidate_rows) < row_limit * 2:
|
| 909 |
+
prev_line = lines[i - 1].strip()
|
| 910 |
+
# Check if previous line might be a title
|
| 911 |
+
if len(prev_line) > 20 and not prev_line.startswith("![") and not prev_line.startswith("http"):
|
| 912 |
+
title_normalized = prev_line.lower()[:50]
|
| 913 |
+
if title_normalized not in seen_titles:
|
| 914 |
+
# Look for a nearby link
|
| 915 |
+
nearby_link = None
|
| 916 |
+
for offset in range(-3, 4):
|
| 917 |
+
check_idx = i + offset
|
| 918 |
+
if 0 <= check_idx < len(lines):
|
| 919 |
+
link_match = content_link_pattern.search(lines[check_idx])
|
| 920 |
+
if link_match and "watch" in link_match.group(2).lower():
|
| 921 |
+
nearby_link = link_match.group(2)
|
| 922 |
+
break
|
| 923 |
+
|
| 924 |
+
if nearby_link: # Only add if we found a real link
|
| 925 |
+
row = {}
|
| 926 |
+
for col in columns:
|
| 927 |
+
lower_col = col.lower()
|
| 928 |
+
if lower_col in {"title", "name", "text"}:
|
| 929 |
+
row[col] = prev_line[:160]
|
| 930 |
+
elif lower_col in {"views", "view_count", "viewers"}:
|
| 931 |
+
row[col] = views
|
| 932 |
+
elif lower_col in {"url", "link", "href"}:
|
| 933 |
+
row[col] = nearby_link
|
| 934 |
+
else:
|
| 935 |
+
row[col] = ""
|
| 936 |
+
seen_titles.add(title_normalized)
|
| 937 |
+
candidate_rows.append((2, row)) # Lower score for these
|
| 938 |
+
|
| 939 |
+
# Sort by score (higher is better) and filter out items without views when we have enough with views
|
| 940 |
candidate_rows.sort(key=lambda x: x[0], reverse=True)
|
| 941 |
+
|
| 942 |
+
# Prefer rows with views
|
| 943 |
+
with_views = [(score, row) for score, row in candidate_rows if row.get("views") or row.get("view_count")]
|
| 944 |
+
without_views = [(score, row) for score, row in candidate_rows if not (row.get("views") or row.get("view_count"))]
|
| 945 |
+
|
| 946 |
+
result = []
|
| 947 |
+
for _, row in with_views[:row_limit]:
|
| 948 |
+
result.append(row)
|
| 949 |
+
|
| 950 |
+
# Fill remaining slots with rows without views
|
| 951 |
+
remaining = row_limit - len(result)
|
| 952 |
+
if remaining > 0:
|
| 953 |
+
for _, row in without_views[:remaining]:
|
| 954 |
+
result.append(row)
|
| 955 |
+
|
| 956 |
+
return result
|
| 957 |
|
| 958 |
|
| 959 |
def _extract_rows_from_text_render(
|