Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| export_browser_history.py | |
| Cross-platform script to export browsing history from Chromium-based browsers | |
| (Chrome, Chromium, Brave, Edge) and Firefox to CSV format. | |
| Usage examples: | |
| # Export all history to separate CSV files | |
| python export_browser_history.py | |
| # Export all history to a single merged CSV file | |
| python export_browser_history.py --merge | |
| # Export only YouTube watch history | |
| python export_browser_history.py --only youtube --merge | |
| # Export only search engine queries | |
| python export_browser_history.py --only search --output-dir ./searches | |
| # Export history since a specific date | |
| python export_browser_history.py --since 2024-01-01 --merge | |
| # Export with row limit (for testing) | |
| python export_browser_history.py --limit 100 --merge | |
| """ | |
| import argparse | |
| import csv | |
| import os | |
| import platform | |
| import shutil | |
| import sqlite3 | |
| import sys | |
| import tempfile | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import List, Dict, Optional, Tuple | |
| from urllib.parse import urlparse, parse_qs | |
| # Browser database path configurations | |
def get_browser_paths() -> Dict[str, List[Path]]:
    """Return candidate browser data directories for the current OS.

    Returns:
        Mapping from browser name ('Chrome', 'Chromium', 'Brave', 'Edge',
        'Firefox') to a list of base directories that may contain profile
        data.  Lists stay empty for browsers with no known location on
        this platform (or when the relevant env vars are unset on Windows).
    """
    system = platform.system()
    home = Path.home()
    paths: Dict[str, List[Path]] = {
        'Chrome': [],
        'Chromium': [],
        'Brave': [],
        'Edge': [],
        'Firefox': [],
    }
    if system == 'Linux':
        paths['Chrome'] = [home / '.config/google-chrome']
        paths['Chromium'] = [home / '.config/chromium']
        paths['Brave'] = [home / '.config/BraveSoftware/Brave-Browser']
        paths['Edge'] = [home / '.config/microsoft-edge']
        paths['Firefox'] = [home / '.mozilla/firefox']
    elif system == 'Darwin':  # macOS
        paths['Chrome'] = [home / 'Library/Application Support/Google/Chrome']
        paths['Chromium'] = [home / 'Library/Application Support/Chromium']
        paths['Brave'] = [home / 'Library/Application Support/BraveSoftware/Brave-Browser']
        paths['Edge'] = [home / 'Library/Application Support/Microsoft Edge']
        paths['Firefox'] = [home / 'Library/Application Support/Firefox/Profiles']
    elif system == 'Windows':
        # Test the raw environment strings, not Path objects: Path('') is
        # truthy (it normalizes to '.'), so the previous Path-based check
        # passed even when LOCALAPPDATA/APPDATA were unset.
        local_appdata = os.getenv('LOCALAPPDATA')
        roaming_appdata = os.getenv('APPDATA')
        if local_appdata:
            local = Path(local_appdata)
            paths['Chrome'] = [local / 'Google/Chrome/User Data']
            paths['Chromium'] = [local / 'Chromium/User Data']
            paths['Brave'] = [local / 'BraveSoftware/Brave-Browser/User Data']
            paths['Edge'] = [local / 'Microsoft/Edge/User Data']
        if roaming_appdata:
            paths['Firefox'] = [Path(roaming_appdata) / 'Mozilla/Firefox/Profiles']
    return paths
def find_chromium_profiles(browser_base_path: Path) -> List[Tuple[str, Path]]:
    """Locate Chromium-style profiles that contain a History database.

    Scans the browser's base directory for the primary 'Default' profile
    and any numbered 'Profile N' directories, returning
    (profile_name, history_db_path) pairs for those with a History file.
    """
    found: List[Tuple[str, Path]] = []
    if not browser_base_path.exists():
        return found
    # The primary profile always lives under 'Default'.
    default_db = browser_base_path / 'Default' / 'History'
    if default_db.exists():
        found.append(('Default', default_db))
    # Secondary profiles are named 'Profile 1', 'Profile 2', ...
    for entry in browser_base_path.glob('Profile *'):
        if not entry.is_dir():
            continue
        db = entry / 'History'
        if db.exists():
            found.append((entry.name, db))
    return found
def find_firefox_profiles(firefox_base_path: Path) -> List[Tuple[str, Path]]:
    """Locate Firefox profiles that contain a places.sqlite database.

    Firefox default profiles are named like 'xxxxxxxx.default' or
    'xxxxxxxx.default-release'; any directory whose name mentions
    'default' is considered, and it is kept when places.sqlite exists.
    """
    found: List[Tuple[str, Path]] = []
    if not firefox_base_path.exists():
        return found
    for entry in firefox_base_path.iterdir():
        if not entry.is_dir():
            continue
        name = entry.name
        if 'default' not in name.lower() and not name.endswith('.default-release'):
            continue
        db = entry / 'places.sqlite'
        if db.exists():
            found.append((name, db))
    return found
def chromium_timestamp_to_iso(timestamp: int) -> str:
    """Convert a Chromium timestamp to an ISO 8601 UTC string.

    Chromium stores times as microseconds since 1601-01-01 00:00:00 UTC.
    Returns '' for a zero timestamp or any value that cannot be converted.
    """
    if timestamp == 0:
        return ''
    try:
        # Shift from the Windows/Chromium epoch (1601) to the Unix epoch
        # (1970): the gap is 11644473600 seconds.
        unix_seconds = (timestamp - 11644473600000000) / 1000000
        return datetime.fromtimestamp(unix_seconds, tz=timezone.utc).isoformat()
    except (ValueError, OSError, OverflowError):
        # Corrupt/absurd values may raise OverflowError, which the previous
        # (ValueError, OSError) clause let escape and crash the export.
        return ''
def firefox_timestamp_to_iso(timestamp: int) -> str:
    """Convert a Firefox timestamp to an ISO 8601 UTC string.

    places.sqlite stores visit times as microseconds since the Unix epoch;
    millisecond- and second-resolution values are tolerated as well.
    Returns '' for zero or unconvertible values.
    """
    if timestamp == 0:
        return ''
    try:
        # Classify by magnitude.  The previous cutoffs (1e16 / 1e10) treated
        # every modern microsecond timestamp (~1.7e15 for dates near 2024)
        # as milliseconds, producing an out-of-range year and '' for nearly
        # every real Firefox row.
        if timestamp > 100_000_000_000_000:   # microseconds (dates after ~1973)
            unix_seconds = timestamp / 1_000_000
        elif timestamp > 100_000_000_000:     # milliseconds (dates after ~1973)
            unix_seconds = timestamp / 1_000
        else:                                 # already in seconds
            unix_seconds = timestamp
        return datetime.fromtimestamp(unix_seconds, tz=timezone.utc).isoformat()
    except (ValueError, OSError, OverflowError):
        return ''
def is_youtube_url(url: str) -> bool:
    """Return True if *url* points at YouTube watched content.

    Matches watch/shorts/playlist pages on youtube.com / m.youtube.com,
    and any youtu.be short link.
    """
    try:
        parsed = urlparse(url.lower())
    except (ValueError, AttributeError):
        # Malformed input (or a non-string) is simply not a YouTube URL;
        # the previous bare `except:` also swallowed SystemExit et al.
        return False
    host = parsed.netloc.replace('www.', '')
    if host in ('youtube.com', 'm.youtube.com'):
        return ('/watch' in parsed.path or '/shorts' in parsed.path
                or '/playlist' in parsed.path)
    return host == 'youtu.be'
def is_search_url(url: str) -> bool:
    """Return True if *url* looks like a search-engine results page.

    Recognizes Google, Bing, DuckDuckGo and Yahoo query URLs.  Parameter
    presence is checked with parse_qs rather than substring matching, so
    e.g. 'faq=1' is no longer mistaken for a 'q=' query parameter.
    """
    try:
        parsed = urlparse(url.lower())
        params = parse_qs(parsed.query)
    except (ValueError, AttributeError):
        return False
    host = parsed.netloc.replace('www.', '')
    # Google search (includes image/video tabs via 'tbm')
    if 'google' in host and ('/search' in parsed.path or 'tbm' in params):
        return 'q' in params or 'query' in params
    # Bing
    if 'bing.com' in host and '/search' in parsed.path:
        return 'q' in params
    # DuckDuckGo serves results straight off the root path
    if 'duckduckgo.com' in host:
        return 'q' in params
    # Yahoo uses 'p' for the query text
    if 'yahoo.com' in host and '/search' in parsed.path:
        return 'p' in params
    return False
def extract_search_query(url: str) -> str:
    """Extract the query text from a search-engine URL, or '' if absent."""
    try:
        params = parse_qs(urlparse(url).query)
    except (ValueError, AttributeError):
        # Malformed input yields no query; narrower than the old bare except.
        return ''
    # Common parameter names: 'q' (Google/Bing/DDG), 'p' (Yahoo), 'query'.
    for key in ('q', 'p', 'query'):
        values = params.get(key)
        if values:
            return values[0]
    return ''
def export_chromium_history(db_path: Path, browser: str, profile: str,
                            filter_type: Optional[str], since_date: Optional[datetime],
                            limit: Optional[int]) -> List[Dict]:
    """Export history rows from a Chromium-based browser History database.

    Args:
        db_path: Path to the profile's 'History' SQLite file.
        browser: Browser display name copied into each output row.
        profile: Profile display name copied into each output row.
        filter_type: 'youtube', 'search', or None for everything.
        since_date: Only include visits at or after this UTC datetime.
        limit: Maximum number of DB rows to read (applied before filtering).

    Returns:
        List of row dicts ready for CSV writing.  Errors are reported to
        stderr and produce a (possibly partial) result rather than raising.
    """
    rows: List[Dict] = []
    # Work on a copy: the live database is locked while the browser runs.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.db') as tmp_file:
        temp_db = tmp_file.name
    conn = None
    try:
        shutil.copy2(db_path, temp_db)
        conn = sqlite3.connect(temp_db)
        cursor = conn.cursor()
        query = "SELECT url, title, visit_count, last_visit_time FROM urls"
        params: List = []
        if since_date:
            # Chromium stores microseconds since 1601-01-01; shift the Unix
            # timestamp by the 11644473600-second epoch difference.
            chromium_ts = int((since_date.timestamp() + 11644473600) * 1000000)
            query += " WHERE last_visit_time >= ?"
            params.append(chromium_ts)
        query += " ORDER BY last_visit_time DESC"
        if limit:
            # Bind the limit instead of interpolating it into the SQL text.
            query += " LIMIT ?"
            params.append(limit)
        cursor.execute(query, params)
        for url, title, visit_count, last_visit_time in cursor.fetchall():
            # Apply the optional category filters.
            if filter_type == 'youtube' and not is_youtube_url(url):
                continue
            if filter_type == 'search' and not is_search_url(url):
                continue
            # Rows surviving the 'search' filter are search URLs by construction.
            search_query = extract_search_query(url) if filter_type == 'search' else ''
            rows.append({
                'Browser': browser,
                'Profile': profile,
                'URL': url,
                'Title': title or '',
                'Visit Count': visit_count or 0,
                'Last Visit Time (ISO)': chromium_timestamp_to_iso(last_visit_time),
                'Search Query': search_query
            })
    except Exception as e:
        print(f"Error reading {browser} ({profile}): {e}", file=sys.stderr)
    finally:
        # Close the connection even on error (previously it leaked when an
        # exception fired mid-read), then always remove the temp copy.
        if conn is not None:
            conn.close()
        try:
            os.unlink(temp_db)
        except OSError:
            pass
    return rows
def export_firefox_history(db_path: Path, profile: str,
                           filter_type: Optional[str], since_date: Optional[datetime],
                           limit: Optional[int]) -> List[Dict]:
    """Export history rows from a Firefox places.sqlite database.

    Args:
        db_path: Path to the profile's places.sqlite file.
        profile: Profile display name copied into each output row.
        filter_type: 'youtube', 'search', or None for everything.
        since_date: Only include visits at or after this UTC datetime.
        limit: Maximum number of DB rows to read (applied before filtering).

    Returns:
        List of row dicts ready for CSV writing.  Errors are reported to
        stderr and produce a (possibly partial) result rather than raising.
    """
    rows: List[Dict] = []
    # Work on a copy: the live database is locked while Firefox runs.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.db') as tmp_file:
        temp_db = tmp_file.name
    conn = None
    try:
        shutil.copy2(db_path, temp_db)
        conn = sqlite3.connect(temp_db)
        cursor = conn.cursor()
        # Prefer joining moz_historyvisits for accurate per-visit timestamps.
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='moz_historyvisits'")
        has_history_visits = cursor.fetchone() is not None
        params: List = []
        if has_history_visits:
            query = """
            SELECT DISTINCT p.url, p.title, p.visit_count,
                   MAX(h.visit_date) as last_visit_date
            FROM moz_places p
            LEFT JOIN moz_historyvisits h ON p.id = h.place_id
            WHERE p.url IS NOT NULL
            """
            if since_date:
                # Filter on the real column: SQLite rejects the aggregate
                # SELECT alias 'last_visit_date' in a WHERE clause, so the
                # previous query errored whenever --since was combined with
                # this branch.  Keeping only visits >= the cutoff preserves
                # the intended "last visit after --since" semantics.
                query += " AND h.visit_date >= ?"
                params.append(int(since_date.timestamp() * 1000000))
            query += " GROUP BY p.id"
        else:
            query = """
            SELECT url, title, visit_count, last_visit_date
            FROM moz_places
            WHERE url IS NOT NULL
            """
            if since_date:
                # visit_date/last_visit_date are microseconds since the epoch.
                query += " AND last_visit_date >= ?"
                params.append(int(since_date.timestamp() * 1000000))
        query += " ORDER BY last_visit_date DESC"
        if limit:
            # Bind the limit instead of interpolating it into the SQL text.
            query += " LIMIT ?"
            params.append(limit)
        cursor.execute(query, params)
        for url, title, visit_count, last_visit_date in cursor.fetchall():
            # Apply the optional category filters.
            if filter_type == 'youtube' and not is_youtube_url(url):
                continue
            if filter_type == 'search' and not is_search_url(url):
                continue
            # Rows surviving the 'search' filter are search URLs by construction.
            search_query = extract_search_query(url) if filter_type == 'search' else ''
            rows.append({
                'Browser': 'Firefox',
                'Profile': profile,
                'URL': url,
                'Title': title or '',
                'Visit Count': visit_count or 0,
                'Last Visit Time (ISO)': firefox_timestamp_to_iso(last_visit_date) if last_visit_date else '',
                'Search Query': search_query
            })
    except Exception as e:
        print(f"Error reading Firefox ({profile}): {e}", file=sys.stderr)
    finally:
        # Close the connection even on error (previously it leaked when an
        # exception fired mid-read), then always remove the temp copy.
        if conn is not None:
            conn.close()
        try:
            os.unlink(temp_db)
        except OSError:
            pass
    return rows
def write_csv(rows: List[Dict], output_path: Path, include_search_query: bool):
    """Write history row dicts to a CSV file; no-op when *rows* is empty.

    The 'Search Query' column is emitted only when include_search_query is
    set; any extra keys in the row dicts are silently ignored.
    """
    if not rows:
        return
    columns = ['Browser', 'Profile', 'URL', 'Title',
               'Visit Count', 'Last Visit Time (ISO)']
    if include_search_query:
        columns = columns + ['Search Query']
    with open(output_path, 'w', encoding='utf-8', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, extrasaction='ignore')
        writer.writeheader()
        for record in rows:
            writer.writerow(record)
def main():
    """CLI entry point: parse arguments, discover browser profiles, export
    each profile's history to CSV, and print a summary."""
    parser = argparse.ArgumentParser(
        description='Export browsing history from Chrome, Chromium, Brave, Edge, and Firefox',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s
  %(prog)s --merge
  %(prog)s --only youtube --merge
  %(prog)s --only search --output-dir ./searches
  %(prog)s --since 2024-01-01 --merge
  %(prog)s --limit 100 --merge
"""
    )
    parser.add_argument('--output-dir', type=str, default='.',
                        help='Output directory for CSV files (default: current directory)')
    parser.add_argument('--merge', action='store_true',
                        help='Merge all history into a single CSV file')
    parser.add_argument('--only', choices=['youtube', 'search'],
                        help='Export only YouTube URLs or search engine queries')
    parser.add_argument('--since', type=str,
                        help='Export only visits after this date (YYYY-MM-DD)')
    parser.add_argument('--limit', type=int,
                        help='Limit number of rows per output file (for testing)')
    args = parser.parse_args()
    # Parse --since as midnight UTC; reject anything not strictly YYYY-MM-DD.
    since_date = None
    if args.since:
        try:
            since_date = datetime.strptime(args.since, '%Y-%m-%d').replace(tzinfo=timezone.utc)
        except ValueError:
            print(f"Error: Invalid date format '{args.since}'. Use YYYY-MM-DD", file=sys.stderr)
            sys.exit(1)
    # Create output directory (including parents) if it does not exist yet.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Candidate base directories per browser for the current OS.
    browser_paths = get_browser_paths()
    all_rows = []   # accumulates every row when --merge is requested
    file_count = 0  # number of CSV files written in separate-file mode
    # Process Chromium-based browsers: every base path, every profile found.
    for browser in ['Chrome', 'Chromium', 'Brave', 'Edge']:
        for base_path in browser_paths[browser]:
            profiles = find_chromium_profiles(base_path)
            for profile_name, db_path in profiles:
                print(f"Found {browser} profile: {profile_name} at {db_path}")
                rows = export_chromium_history(
                    db_path, browser, profile_name,
                    args.only, since_date, args.limit
                )
                if rows:
                    if args.merge:
                        all_rows.extend(rows)
                    else:
                        # Write one CSV per (browser, profile). The Search
                        # Query column is included only for --only search.
                        filename = f"{browser.lower()}_{profile_name.lower()}_history.csv"
                        output_path = output_dir / filename
                        write_csv(rows, output_path, args.only == 'search')
                        print(f"Exported {len(rows)} rows to {output_path}")
                        file_count += 1
    # Process Firefox profiles the same way.
    for base_path in browser_paths['Firefox']:
        profiles = find_firefox_profiles(base_path)
        for profile_name, db_path in profiles:
            print(f"Found Firefox profile: {profile_name} at {db_path}")
            rows = export_firefox_history(
                db_path, profile_name,
                args.only, since_date, args.limit
            )
            if rows:
                if args.merge:
                    all_rows.extend(rows)
                else:
                    # Write separate file, as for the Chromium browsers above.
                    filename = f"firefox_{profile_name.lower()}_history.csv"
                    output_path = output_dir / filename
                    write_csv(rows, output_path, args.only == 'search')
                    print(f"Exported {len(rows)} rows to {output_path}")
                    file_count += 1
    # In merge mode everything lands in a single CSV; file_count is reset to
    # reflect that exactly one file was produced.
    if args.merge and all_rows:
        output_path = output_dir / 'merged_browser_history.csv'
        write_csv(all_rows, output_path, args.only == 'search')
        print(f"\nMerged export: {len(all_rows)} total rows to {output_path}")
        file_count = 1
    # Summary banner. Note all_rows is only populated in merge mode, so the
    # total-row figure defers to the per-file messages otherwise.
    print(f"\n{'='*60}")
    print(f"Export complete!")
    print(f"Files created: {file_count}")
    print(f"Total rows exported: {len(all_rows) if args.merge else 'See individual files'}")
    print(f"{'='*60}")
if __name__ == '__main__':
    main()