import re import csv import json import time import random import requests import feedparser from bs4 import BeautifulSoup from urllib.parse import quote from datetime import datetime def extract_data_from_publication_info(publication_info): authors = "" journal = "" year = "" publisher = "" regex_pattern = r"(.+?)\s+-\s+(.+?),\s+(\d{4})\s+-\s+(.+)$" match = re.match(regex_pattern, publication_info) if match: authors = match.group(1).strip() journal = match.group(2).strip() year = match.group(3).strip() publisher = match.group(4).strip() else: authors = "Unknown" journal = "Unknown" year = "Unknown" publisher = "Unknown" return { "authors": authors, "journal": journal, "year": year, "publisher": publisher } def scrape_gg_scholar(query, num_pages=1, start_year=None, end_year=None): results = [] header = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } base_url = "https://scholar.google.com/scholar?" params = { "q": query.replace(" ", "+"), "hl": "en", "as_sdt": "0,5" } if start_year and end_year: params["as_ylo"] = start_year params["as_yhi"] = end_year for pages in range(num_pages): start = pages * 10 params["start"] = start url_params = "&".join([f"{k}={v}" for k, v in params.items()]) url = base_url + url_params try: response = requests.get(url, headers=header) if response.status_code == 200: soup = BeautifulSoup(response.text, "html.parser") articles = soup.select(".gs_r.gs_or.gs_scl") for article in articles: title_element = article.select_one(".gs_rt a") if title_element: title = title_element.text link = title_element["href"] abstract_element = article.select_one(".gs_rs") abstract = abstract_element.text if abstract_element else "" abstract = abstract.replace("…", "").strip() abstract = abstract.replace("\n", "").strip() abstract = " ".join(abstract.split()) pub_info_element = article.select_one(".gs_a") pub_info = pub_info_element.text if pub_info_element else "" pub_info_parsed = extract_data_from_publication_info(pub_info) results.append({ "GGS_title": title, "GGS_link": link, "GGS_brief_abstract": abstract, "GGS_publication_info": pub_info, "GGS_authors": pub_info_parsed["authors"], "GGS_journal": pub_info_parsed["journal"], "GGS_year": pub_info_parsed["year"], "GGS_publisher": pub_info_parsed["publisher"] }) time.sleep(random.uniform(1, 3)) # Random sleep to avoid being blocked else: print(f"ERROR: STATUS CODE {response.status_code}") break except Exception as e: print(f"An error occurred: {e}") break return results def save_to_csv(data, filename="base_crawling_from_gg_scholar.csv"): with open(filename, "w", newline="", encoding="utf-8-sig") as csvfile: fieldnames = data[0].keys() writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for row in data: writer.writerow(row) print(f"Data saved to {filename}") def main(): query = input("Enter your search query: ") num_pages = int(input("Enter the number of pages to scrape: ")) use_time_filter = input("Do you want to filter by year? (y/n): ").strip().lower() start_year = None end_year = None if use_time_filter == 'y': start_year = input("Enter the start year (format: YYYY; for example: 2020): ") end_year = input("Enter the end year (format: YYYY; for example: 2025): ") results = scrape_gg_scholar(query, num_pages, start_year, end_year) print(f"Found {len(results)} results.") for i, result in enumerate(results): print(f"{i + 1}. {result['title']}") print(f" Link: {result['link']}") print(f" Brief Abstract: {result['brief_abstract']}") print(f" Publication Info: {result['publication_info']}") print(f" Authors: {result['authors']}") print(f" Journal: {result['journal']}") print(f" Year: {result['year']}") print(f" Publisher: {result['publisher']}") print("=" * 100) save_option = input("Do you want to save the results to a CSV file? (y/n): ").strip().lower() if save_option == 'y': file_name = input("Enter the filename (default file name is 'base_crawling_from_gg_scholar.csv', enter fine name without extension): ") save_to_csv(results) if __name__ == "__main__": main() # The code is designed to scrape Google Scholar for academic articles based on a search query.