# edutech / search_history_clude.py
# Uploaded by RajatMalviya (commit 2762376, verified)
#!/usr/bin/env python3
"""
export_browser_history.py
Cross-platform script to export browsing history from Chromium-based browsers
(Chrome, Chromium, Brave, Edge) and Firefox to CSV format.
Usage examples:
# Export all history to separate CSV files
python export_browser_history.py
# Export all history to a single merged CSV file
python export_browser_history.py --merge
# Export only YouTube watch history
python export_browser_history.py --only youtube --merge
# Export only search engine queries
python export_browser_history.py --only search --output-dir ./searches
# Export history since a specific date
python export_browser_history.py --since 2024-01-01 --merge
# Export with row limit (for testing)
python export_browser_history.py --limit 100 --merge
"""
import argparse
import csv
import os
import platform
import shutil
import sqlite3
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from urllib.parse import urlparse, parse_qs
# Browser database path configurations
def get_browser_paths() -> Dict[str, List[Path]]:
    """Return candidate browser profile base directories for the current OS.

    Returns:
        Mapping of browser name ('Chrome', 'Chromium', 'Brave', 'Edge',
        'Firefox') to a list of base directories that may hold profile data.
        Paths are not checked for existence here; missing browsers keep an
        empty list.
    """
    system = platform.system()
    home = Path.home()
    paths: Dict[str, List[Path]] = {
        'Chrome': [],
        'Chromium': [],
        'Brave': [],
        'Edge': [],
        'Firefox': []
    }
    if system == 'Linux':
        paths['Chrome'] = [home / '.config/google-chrome']
        paths['Chromium'] = [home / '.config/chromium']
        paths['Brave'] = [home / '.config/BraveSoftware/Brave-Browser']
        paths['Edge'] = [home / '.config/microsoft-edge']
        paths['Firefox'] = [home / '.mozilla/firefox']
    elif system == 'Darwin':  # macOS
        paths['Chrome'] = [home / 'Library/Application Support/Google/Chrome']
        paths['Chromium'] = [home / 'Library/Application Support/Chromium']
        paths['Brave'] = [home / 'Library/Application Support/BraveSoftware/Brave-Browser']
        paths['Edge'] = [home / 'Library/Application Support/Microsoft Edge']
        paths['Firefox'] = [home / 'Library/Application Support/Firefox/Profiles']
    elif system == 'Windows':
        # BUG FIX: test the env-var strings themselves. Path('') equals
        # Path('.'), which is truthy, so the original `if Path(...)` guard
        # could never detect a missing environment variable.
        local = os.getenv('LOCALAPPDATA')
        roaming = os.getenv('APPDATA')
        if local:
            appdata = Path(local)
            paths['Chrome'] = [appdata / 'Google/Chrome/User Data']
            paths['Chromium'] = [appdata / 'Chromium/User Data']
            paths['Brave'] = [appdata / 'BraveSoftware/Brave-Browser/User Data']
            paths['Edge'] = [appdata / 'Microsoft/Edge/User Data']
        if roaming:
            paths['Firefox'] = [Path(roaming) / 'Mozilla/Firefox/Profiles']
    return paths
def find_chromium_profiles(browser_base_path: Path) -> List[Tuple[str, Path]]:
    """Locate Chromium-family profile History databases under a base dir.

    Checks the 'Default' profile first, then any 'Profile N' directories.

    Returns:
        List of (profile_name, history_db_path) tuples; empty when the base
        directory does not exist or holds no History files.
    """
    if not browser_base_path.exists():
        return []
    # Candidate profile dirs: 'Default', then each 'Profile *' directory.
    candidates = [browser_base_path / 'Default']
    candidates.extend(d for d in browser_base_path.glob('Profile *') if d.is_dir())
    found: List[Tuple[str, Path]] = []
    for profile_dir in candidates:
        history_db = profile_dir / 'History'
        if history_db.exists():
            found.append((profile_dir.name, history_db))
    return found
def find_firefox_profiles(firefox_base_path: Path) -> List[Tuple[str, Path]]:
    """Locate Firefox profile places.sqlite databases under a base dir.

    Returns:
        List of (profile_dir_name, places_sqlite_path) tuples; empty when the
        base directory does not exist or no default-style profile is found.
    """
    results: List[Tuple[str, Path]] = []
    if not firefox_base_path.exists():
        return results
    for entry in firefox_base_path.iterdir():
        if not entry.is_dir():
            continue
        # Match '.default' / '.default-release' style profile directories.
        # (The endswith('.default-release') test of the original is subsumed
        # by this substring check, so behavior is unchanged.)
        if 'default' not in entry.name.lower():
            continue
        places_db = entry / 'places.sqlite'
        if places_db.exists():
            results.append((entry.name, places_db))
    return results
def chromium_timestamp_to_iso(timestamp: int) -> str:
"""Convert Chromium timestamp (microseconds since 1601-01-01) to ISO 8601."""
if timestamp == 0:
return ''
try:
# Chromium epoch: 1601-01-01 00:00:00 UTC
# Convert to Unix epoch (1970-01-01)
unix_timestamp = (timestamp - 11644473600000000) / 1000000
dt = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)
return dt.isoformat()
except (ValueError, OSError):
return ''
def firefox_timestamp_to_iso(timestamp: int) -> str:
    """Convert a Firefox places timestamp to an ISO 8601 UTC string.

    Firefox (PRTime) stores visit dates as microseconds since the Unix epoch,
    but be tolerant of millisecond or second values from other sources.

    Args:
        timestamp: Raw value from `moz_places.last_visit_date` /
            `moz_historyvisits.visit_date`.

    Returns:
        ISO 8601 string, or '' for 0 or unconvertible values.
    """
    if timestamp == 0:
        return ''
    try:
        # Unit heuristic: a 2024 date is ~1.7e15 µs, ~1.7e12 ms, ~1.7e9 s.
        # BUG FIX: the original cutoff of 1e16 classified every real PRTime
        # microsecond value (anything before year ~2286) as milliseconds,
        # producing out-of-range dates and an empty string for all rows.
        if timestamp > 100_000_000_000_000:   # microseconds (dates after ~1973)
            unix_timestamp = timestamp / 1_000_000
        elif timestamp > 100_000_000_000:     # milliseconds (dates after ~1973)
            unix_timestamp = timestamp / 1_000
        else:                                 # already seconds
            unix_timestamp = timestamp
        dt = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)
        return dt.isoformat()
    except (ValueError, OSError, OverflowError):
        return ''
def is_youtube_url(url: str) -> bool:
    """Return True for YouTube watch/shorts/playlist URLs, or any youtu.be link.

    Args:
        url: Full URL string from a history row.
    """
    try:
        parsed = urlparse(url.lower())
        netloc = parsed.netloc
        # Strip only a *leading* 'www.'; str.replace('www.', '') would also
        # mangle hosts that merely contain the substring elsewhere.
        domain = netloc[4:] if netloc.startswith('www.') else netloc
        if domain in ('youtube.com', 'm.youtube.com'):
            path = parsed.path
            return '/watch' in path or '/shorts' in path or '/playlist' in path
        return domain == 'youtu.be'
    except (ValueError, AttributeError):
        # Narrowed from a bare except: urlparse raises ValueError on malformed
        # input; AttributeError covers non-string arguments.
        return False
def is_search_url(url: str) -> bool:
    """Return True if *url* looks like a search-engine results query.

    Recognizes Google, Bing, DuckDuckGo, and Yahoo searches.

    Args:
        url: Full URL string from a history row.
    """
    try:
        parsed = urlparse(url.lower())
        netloc = parsed.netloc
        # Strip only a leading 'www.' (see is_youtube_url).
        domain = netloc[4:] if netloc.startswith('www.') else netloc
        # BUG FIX: parse the query string properly. The original substring
        # tests ('q=' in query) false-matched unrelated parameters such as
        # '?freq=5' or '?seq=1'.
        params = parse_qs(parsed.query)
        # Google search (including image/video tabs via 'tbm')
        if 'google' in domain and ('/search' in parsed.path or 'tbm' in params):
            return 'q' in params or 'query' in params
        # Bing
        if 'bing.com' in domain and '/search' in parsed.path:
            return 'q' in params
        # DuckDuckGo (queries can live at the root path)
        if 'duckduckgo.com' in domain:
            return 'q' in params
        # Yahoo uses 'p' for the query term
        if 'yahoo.com' in domain and '/search' in parsed.path:
            return 'p' in params
        return False
    except (ValueError, AttributeError):
        return False
def extract_search_query(url: str) -> str:
    """Return the search term embedded in a search-engine URL, or ''.

    Args:
        url: Full URL string (not lowercased, so the query text keeps its
            original case).
    """
    try:
        params = parse_qs(urlparse(url).query)
    except (ValueError, AttributeError):
        # Narrowed from a bare except: only parse failures should fall back.
        return ''
    # 'q' (Google/Bing/DuckDuckGo), 'p' (Yahoo), 'query' (misc engines).
    for key in ('q', 'p', 'query'):
        values = params.get(key)
        if values:
            return values[0]
    return ''
def export_chromium_history(db_path: Path, browser: str, profile: str,
                            filter_type: Optional[str], since_date: Optional[datetime],
                            limit: Optional[int]) -> List[Dict]:
    """Export history rows from a Chromium-family 'History' SQLite database.

    Args:
        db_path: Path to the profile's History file.
        browser: Browser display name to tag output rows with.
        profile: Profile name to tag output rows with.
        filter_type: 'youtube', 'search', or None for all URLs.
        since_date: Only include visits at or after this UTC datetime.
        limit: Cap on rows fetched from the DB (applied before filtering).

    Returns:
        List of row dicts ready for csv.DictWriter; empty on any DB error
        (the error is reported to stderr so other profiles can proceed).
    """
    rows: List[Dict] = []
    # Work on a copy: the live database is locked while the browser runs.
    # Close the temp handle before copying so the copy also works on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.db') as tmp_file:
        temp_db = tmp_file.name
    try:
        shutil.copy2(db_path, temp_db)
        conn = sqlite3.connect(temp_db)
        try:
            cursor = conn.cursor()
            query = "SELECT url, title, visit_count, last_visit_time FROM urls"
            params: list = []
            if since_date:
                # Chromium stores microseconds since 1601-01-01 00:00:00 UTC.
                chromium_ts = int((since_date.timestamp() + 11644473600) * 1000000)
                query += " WHERE last_visit_time >= ?"
                params.append(chromium_ts)
            query += " ORDER BY last_visit_time DESC"
            if limit:
                # Bind LIMIT as a parameter instead of f-string interpolation.
                query += " LIMIT ?"
                params.append(limit)
            cursor.execute(query, params)
            for url, title, visit_count, last_visit_time in cursor.fetchall():
                # Apply the optional category filter.
                if filter_type == 'youtube' and not is_youtube_url(url):
                    continue
                if filter_type == 'search' and not is_search_url(url):
                    continue
                # Row already passed is_search_url above, so extract directly.
                search_query = extract_search_query(url) if filter_type == 'search' else ''
                rows.append({
                    'Browser': browser,
                    'Profile': profile,
                    'URL': url,
                    'Title': title or '',
                    'Visit Count': visit_count or 0,
                    'Last Visit Time (ISO)': chromium_timestamp_to_iso(last_visit_time),
                    'Search Query': search_query
                })
        finally:
            # Always release the connection, even if the query fails.
            conn.close()
    except Exception as e:
        # Best-effort per-profile export: report and continue with others.
        print(f"Error reading {browser} ({profile}): {e}", file=sys.stderr)
    finally:
        try:
            os.unlink(temp_db)
        except OSError:
            # Narrowed from a bare except: only filesystem errors are expected.
            pass
    return rows
def export_firefox_history(db_path: Path, profile: str,
                           filter_type: Optional[str], since_date: Optional[datetime],
                           limit: Optional[int]) -> List[Dict]:
    """Export history rows from a Firefox places.sqlite database.

    Args:
        db_path: Path to the profile's places.sqlite file.
        profile: Profile name to tag output rows with.
        filter_type: 'youtube', 'search', or None for all URLs.
        since_date: Only include visits at or after this UTC datetime.
        limit: Cap on rows fetched from the DB (applied before filtering).

    Returns:
        List of row dicts ('Browser' is always 'Firefox'); empty on any DB
        error (reported to stderr so other profiles can proceed).
    """
    rows: List[Dict] = []
    # Work on a copy: the live database is locked while Firefox runs.
    # Close the temp handle before copying so the copy also works on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.db') as tmp_file:
        temp_db = tmp_file.name
    try:
        shutil.copy2(db_path, temp_db)
        conn = sqlite3.connect(temp_db)
        try:
            cursor = conn.cursor()
            # Prefer joining moz_historyvisits for accurate last-visit times.
            cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='moz_historyvisits'")
            has_history_visits = cursor.fetchone() is not None
            if has_history_visits:
                query = """
                SELECT DISTINCT p.url, p.title, p.visit_count,
                       MAX(h.visit_date) as last_visit_date
                FROM moz_places p
                LEFT JOIN moz_historyvisits h ON p.id = h.place_id
                WHERE p.url IS NOT NULL
                """
            else:
                query = """
                SELECT url, title, visit_count, last_visit_date
                FROM moz_places
                WHERE url IS NOT NULL
                """
            params: list = []
            if since_date:
                # Firefox stores PRTime: microseconds since the Unix epoch.
                # NOTE: in the joined query this resolves to the real
                # moz_places.last_visit_date column, not the MAX() alias.
                query += " AND last_visit_date >= ?"
                params.append(int(since_date.timestamp() * 1000000))
            if has_history_visits:
                query += " GROUP BY p.id"
            query += " ORDER BY last_visit_date DESC"
            if limit:
                # Bind LIMIT as a parameter instead of f-string interpolation.
                query += " LIMIT ?"
                params.append(limit)
            cursor.execute(query, params)
            for url, title, visit_count, last_visit_date in cursor.fetchall():
                # Apply the optional category filter.
                if filter_type == 'youtube' and not is_youtube_url(url):
                    continue
                if filter_type == 'search' and not is_search_url(url):
                    continue
                search_query = extract_search_query(url) if filter_type == 'search' else ''
                rows.append({
                    'Browser': 'Firefox',
                    'Profile': profile,
                    'URL': url,
                    'Title': title or '',
                    'Visit Count': visit_count or 0,
                    'Last Visit Time (ISO)': firefox_timestamp_to_iso(last_visit_date) if last_visit_date else '',
                    'Search Query': search_query
                })
        finally:
            # Always release the connection, even if the query fails.
            conn.close()
    except Exception as e:
        # Best-effort per-profile export: report and continue with others.
        print(f"Error reading Firefox ({profile}): {e}", file=sys.stderr)
    finally:
        try:
            os.unlink(temp_db)
        except OSError:
            # Narrowed from a bare except: only filesystem errors are expected.
            pass
    return rows
def write_csv(rows: List[Dict], output_path: Path, include_search_query: bool):
    """Write history row dicts to *output_path* as UTF-8 CSV.

    Does nothing (creates no file) when *rows* is empty. Row keys not in the
    header are silently dropped via extrasaction='ignore'.

    Args:
        rows: Row dicts as produced by the export functions.
        output_path: Destination CSV file path.
        include_search_query: Append a 'Search Query' column when True.
    """
    if not rows:
        return
    columns = ['Browser', 'Profile', 'URL', 'Title', 'Visit Count', 'Last Visit Time (ISO)']
    if include_search_query:
        columns = columns + ['Search Query']
    with open(output_path, 'w', encoding='utf-8', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, extrasaction='ignore')
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
def main():
    """CLI entry point: discover browser profiles, export history, write CSVs.

    Flow: parse arguments -> validate --since -> create output dir -> export
    every detected Chromium-family profile, then every Firefox profile ->
    write per-profile CSVs, or one merged CSV when --merge is given.
    """
    parser = argparse.ArgumentParser(
        description='Export browsing history from Chrome, Chromium, Brave, Edge, and Firefox',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s
%(prog)s --merge
%(prog)s --only youtube --merge
%(prog)s --only search --output-dir ./searches
%(prog)s --since 2024-01-01 --merge
%(prog)s --limit 100 --merge
"""
    )
    parser.add_argument('--output-dir', type=str, default='.',
                        help='Output directory for CSV files (default: current directory)')
    parser.add_argument('--merge', action='store_true',
                        help='Merge all history into a single CSV file')
    parser.add_argument('--only', choices=['youtube', 'search'],
                        help='Export only YouTube URLs or search engine queries')
    parser.add_argument('--since', type=str,
                        help='Export only visits after this date (YYYY-MM-DD)')
    parser.add_argument('--limit', type=int,
                        help='Limit number of rows per output file (for testing)')
    args = parser.parse_args()
    # Parse --since into a timezone-aware UTC datetime; exit(1) on bad format.
    since_date = None
    if args.since:
        try:
            since_date = datetime.strptime(args.since, '%Y-%m-%d').replace(tzinfo=timezone.utc)
        except ValueError:
            print(f"Error: Invalid date format '{args.since}'. Use YYYY-MM-DD", file=sys.stderr)
            sys.exit(1)
    # Create the output directory (including parents) if needed.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Candidate install locations for this OS (existence checked per profile).
    browser_paths = get_browser_paths()
    all_rows = []   # accumulates rows across all profiles when --merge is set
    file_count = 0  # number of CSV files written so far
    # Process Chromium-based browsers: one export per discovered profile.
    for browser in ['Chrome', 'Chromium', 'Brave', 'Edge']:
        for base_path in browser_paths[browser]:
            profiles = find_chromium_profiles(base_path)
            for profile_name, db_path in profiles:
                print(f"Found {browser} profile: {profile_name} at {db_path}")
                rows = export_chromium_history(
                    db_path, browser, profile_name,
                    args.only, since_date, args.limit
                )
                if rows:
                    if args.merge:
                        # Defer writing: merged file is emitted once at the end.
                        all_rows.extend(rows)
                    else:
                        # Write one CSV per browser profile.
                        filename = f"{browser.lower()}_{profile_name.lower()}_history.csv"
                        output_path = output_dir / filename
                        write_csv(rows, output_path, args.only == 'search')
                        print(f"Exported {len(rows)} rows to {output_path}")
                        file_count += 1
    # Process Firefox profiles the same way.
    for base_path in browser_paths['Firefox']:
        profiles = find_firefox_profiles(base_path)
        for profile_name, db_path in profiles:
            print(f"Found Firefox profile: {profile_name} at {db_path}")
            rows = export_firefox_history(
                db_path, profile_name,
                args.only, since_date, args.limit
            )
            if rows:
                if args.merge:
                    all_rows.extend(rows)
                else:
                    # Write one CSV per Firefox profile.
                    filename = f"firefox_{profile_name.lower()}_history.csv"
                    output_path = output_dir / filename
                    write_csv(rows, output_path, args.only == 'search')
                    print(f"Exported {len(rows)} rows to {output_path}")
                    file_count += 1
    # Write the single merged file if requested (and anything was collected).
    if args.merge and all_rows:
        output_path = output_dir / 'merged_browser_history.csv'
        write_csv(all_rows, output_path, args.only == 'search')
        print(f"\nMerged export: {len(all_rows)} total rows to {output_path}")
        file_count = 1
    # Summary banner.
    print(f"\n{'='*60}")
    print(f"Export complete!")
    print(f"Files created: {file_count}")
    print(f"Total rows exported: {len(all_rows) if args.merge else 'See individual files'}")
    print(f"{'='*60}")
if __name__ == '__main__':
    main()