NeerajCodz committed on
Commit
9c16219
·
1 Parent(s): 0735175

fix: improve complex link extraction for Twitch-style markdown

Browse files
Files changed (1) hide show
  1. backend/app/api/routes/scrape.py +109 -26
backend/app/api/routes/scrape.py CHANGED
@@ -702,9 +702,13 @@ def _extract_markdown_link_rows(
702
 
703
  # Patterns for extracting content
704
  # Match markdown links like [Title](URL) but NOT image links like ![Image](URL)
705
- content_link_pattern = re.compile(r'(?<!!)\[([^\]]+)\]\((https?://[^)]+)\)')
706
- # Match view counts anywhere
707
- views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*views?', re.IGNORECASE)
 
 
 
 
708
  likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
709
  comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
710
  date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
@@ -754,6 +758,57 @@ def _extract_markdown_link_rows(
754
  if any(label == lowered_line for label in boilerplate_labels):
755
  continue
756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
  # Find content links (not images)
758
  for match in content_link_pattern.finditer(line):
759
  title = match.group(1).strip()
@@ -847,30 +902,58 @@ def _extract_markdown_link_rows(
847
  candidate_rows.append((quality_score, row))
848
 
849
  # Also look for standalone lines with view counts (sometimes titles are separate from links)
850
- for i, views in line_views.items():
851
- if i > 0:
852
- prev_line = lines[i - 1].strip()
853
- # Check if previous line might be a title
854
- if len(prev_line) > 20 and not prev_line.startswith("![") and not prev_line.startswith("http"):
855
- title_normalized = prev_line.lower()[:50]
856
- if title_normalized not in seen_titles:
857
- row = {}
858
- for col in columns:
859
- lower_col = col.lower()
860
- if lower_col in {"title", "name", "text"}:
861
- row[col] = prev_line[:160]
862
- elif lower_col in {"views", "view_count", "viewers"}:
863
- row[col] = views
864
- elif lower_col in {"url", "link", "href"}:
865
- row[col] = source_url
866
- else:
867
- row[col] = ""
868
- seen_titles.add(title_normalized)
869
- candidate_rows.append((2, row)) # Lower score for these
870
-
871
- # Sort by score and return top rows
 
 
 
 
 
 
 
 
 
 
 
 
 
872
  candidate_rows.sort(key=lambda x: x[0], reverse=True)
873
- return [row for _, row in candidate_rows[:row_limit]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
874
 
875
 
876
  def _extract_rows_from_text_render(
 
702
 
703
  # Patterns for extracting content
704
  # Match markdown links like [Title](URL) but NOT image links like ![Image](URL)
705
+ # URL ends at first space, quote, or closing paren
706
+ content_link_pattern = re.compile(r'(?<!!)\[([^\]]+)\]\((https?://[^\s"\)]+)')
707
+ # Match complex links with embedded images: [![Image](img_url) Text](link_url)
708
+ # This captures the text after the image and the final link
709
+ complex_link_pattern = re.compile(r'\[!\[Image[^\]]*\]\([^\)]+\)\s*([^\]]+)\]\((https?://[^\s"\)]+)\)')
710
+ # Match view/viewer counts anywhere (including "47.2K viewers" format)
711
+ views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*(?:views?|viewers?)', re.IGNORECASE)
712
  likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
713
  comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
714
  date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)
 
758
  if any(label == lowered_line for label in boilerplate_labels):
759
  continue
760
 
761
+ # First check for complex links (like Twitch format: [![Image](url) StreamerName Game Live 22K viewers](channel_url))
762
+ complex_match = complex_link_pattern.search(line)
763
+ if complex_match:
764
+ embedded_text = complex_match.group(1).strip()
765
+ link = complex_match.group(2).strip()
766
+
767
+ # Parse embedded text: "StreamerName Game Live 22K 22K viewers Use the..."
768
+ # Remove "Use the..." suffix and Hype Train info
769
+ embedded_text = re.sub(r'\s*Use the.*$', '', embedded_text)
770
+ embedded_text = re.sub(r'\s*Hype Train.*$', '', embedded_text, flags=re.IGNORECASE)
771
+ # Extract viewer count first
772
+ viewer_match = views_pattern.search(embedded_text)
773
+ viewers = viewer_match.group(1) if viewer_match else ""
774
+ # Remove ALL occurrences of viewer count patterns, including orphan "ers"/"viewers"
775
+ name_game = re.sub(r'\d+(?:[.,]\d+)?[KkMmBb]?\s*(?:views?|viewers?|ers)?', '', embedded_text, flags=re.IGNORECASE)
776
+ # Remove standalone "ers" or "viewers" that might remain
777
+ name_game = re.sub(r'\b(?:ers|viewers?|views?)\b', '', name_game, flags=re.IGNORECASE)
778
+ # Remove "Live"
779
+ name_game = re.sub(r'\bLive\b', '', name_game, flags=re.IGNORECASE)
780
+ # Collapse whitespace
781
+ name_game = re.sub(r'\s+', ' ', name_game).strip()
782
+
783
+ # Split into name and game (heuristic: first word is name, rest is game)
784
+ parts = name_game.split(maxsplit=1)
785
+ streamer_name = parts[0] if parts else ""
786
+ game = parts[1].strip() if len(parts) > 1 else ""
787
+
788
+ if streamer_name and link:
789
+ link_normalized = link.split('?')[0]
790
+ if link_normalized not in seen_links:
791
+ seen_links.add(link_normalized)
792
+
793
+ row: dict[str, Any] = {}
794
+ for col in columns:
795
+ lower_col = col.lower()
796
+ if lower_col in {"url", "link", "href", "channel"}:
797
+ row[col] = link
798
+ elif lower_col in {"title", "name", "streamer_name", "streamer", "username"}:
799
+ row[col] = streamer_name
800
+ elif lower_col in {"game", "category", "playing"}:
801
+ row[col] = game
802
+ elif lower_col in {"views", "view_count", "viewers", "viewer_count"}:
803
+ row[col] = viewers
804
+ else:
805
+ row[col] = ""
806
+
807
+ # Streams with viewers are highly relevant
808
+ score = 5 if viewers else 2
809
+ candidate_rows.append((score, row))
810
+ continue # Move to next line
811
+
812
  # Find content links (not images)
813
  for match in content_link_pattern.finditer(line):
814
  title = match.group(1).strip()
 
902
  candidate_rows.append((quality_score, row))
903
 
904
  # Also look for standalone lines with view counts (sometimes titles are separate from links)
905
+ # But only if we haven't found enough rows with proper links
906
+ if len(candidate_rows) < row_limit:
907
+ for i, views in line_views.items():
908
+ if i > 0 and len(candidate_rows) < row_limit * 2:
909
+ prev_line = lines[i - 1].strip()
910
+ # Check if previous line might be a title
911
+ if len(prev_line) > 20 and not prev_line.startswith("![") and not prev_line.startswith("http"):
912
+ title_normalized = prev_line.lower()[:50]
913
+ if title_normalized not in seen_titles:
914
+ # Look for a nearby link
915
+ nearby_link = None
916
+ for offset in range(-3, 4):
917
+ check_idx = i + offset
918
+ if 0 <= check_idx < len(lines):
919
+ link_match = content_link_pattern.search(lines[check_idx])
920
+ if link_match and "watch" in link_match.group(2).lower():
921
+ nearby_link = link_match.group(2)
922
+ break
923
+
924
+ if nearby_link: # Only add if we found a real link
925
+ row = {}
926
+ for col in columns:
927
+ lower_col = col.lower()
928
+ if lower_col in {"title", "name", "text"}:
929
+ row[col] = prev_line[:160]
930
+ elif lower_col in {"views", "view_count", "viewers"}:
931
+ row[col] = views
932
+ elif lower_col in {"url", "link", "href"}:
933
+ row[col] = nearby_link
934
+ else:
935
+ row[col] = ""
936
+ seen_titles.add(title_normalized)
937
+ candidate_rows.append((2, row)) # Lower score for these
938
+
939
+ # Sort by score (higher is better) and filter out items without views when we have enough with views
940
  candidate_rows.sort(key=lambda x: x[0], reverse=True)
941
+
942
+ # Prefer rows with views
943
+ with_views = [(score, row) for score, row in candidate_rows if row.get("views") or row.get("view_count")]
944
+ without_views = [(score, row) for score, row in candidate_rows if not (row.get("views") or row.get("view_count"))]
945
+
946
+ result = []
947
+ for _, row in with_views[:row_limit]:
948
+ result.append(row)
949
+
950
+ # Fill remaining slots with rows without views
951
+ remaining = row_limit - len(result)
952
+ if remaining > 0:
953
+ for _, row in without_views[:remaining]:
954
+ result.append(row)
955
+
956
+ return result
957
 
958
 
959
  def _extract_rows_from_text_render(