Spaces:
Sleeping
Sleeping
| import re | |
| import csv | |
| import json | |
| import time | |
| import random | |
| import requests | |
| import feedparser | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import quote | |
| from datetime import datetime | |
def extract_data_from_publication_info(publication_info):
    """Split a Google Scholar ".gs_a" byline into its four fields.

    Expected shape: "<authors> - <journal>, <YYYY> - <publisher>".
    Returns a dict with keys "authors", "journal", "year", "publisher";
    every field falls back to "Unknown" when the line does not match.
    """
    pattern = r"(.+?)\s+-\s+(.+?),\s+(\d{4})\s+-\s+(.+)$"
    parsed = re.match(pattern, publication_info)
    if parsed is None:
        fields = ("Unknown", "Unknown", "Unknown", "Unknown")
    else:
        fields = tuple(part.strip() for part in parsed.groups())
    return dict(zip(("authors", "journal", "year", "publisher"), fields))
def scrape_gg_scholar(query, num_pages=1, start_year=None, end_year=None):
    """Scrape Google Scholar search-result pages for *query*.

    Parameters
    ----------
    query : str
        Free-text search query; percent-encoded automatically by requests.
    num_pages : int
        Number of 10-result pages to fetch (default 1).
    start_year, end_year : str | int | None
        Optional inclusive year filter; both must be truthy to take effect.

    Returns
    -------
    list[dict]
        One dict per article with "GGS_"-prefixed keys (title, link,
        brief abstract, raw publication info, plus the parsed
        authors/journal/year/publisher fields).
    """
    results = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    base_url = "https://scholar.google.com/scholar"
    params = {
        # Bug fix: the original hand-built the query string with
        # query.replace(" ", "+") and "&".join, which performed no real
        # percent-encoding (queries containing &, =, or non-ASCII broke).
        # Passing params= lets requests encode everything correctly.
        "q": query,
        "hl": "en",
        "as_sdt": "0,5",
    }
    if start_year and end_year:
        params["as_ylo"] = start_year
        params["as_yhi"] = end_year
    for page in range(num_pages):
        params["start"] = page * 10  # Scholar paginates 10 results per page
        try:
            # timeout keeps a stalled connection from hanging the scraper forever
            response = requests.get(base_url, params=params, headers=headers, timeout=30)
            if response.status_code != 200:
                print(f"ERROR: STATUS CODE {response.status_code}")
                break
            soup = BeautifulSoup(response.text, "html.parser")
            for article in soup.select(".gs_r.gs_or.gs_scl"):
                title_element = article.select_one(".gs_rt a")
                if not title_element:
                    # Entries without a linked title (e.g. [CITATION]) are skipped.
                    continue
                abstract_element = article.select_one(".gs_rs")
                abstract = abstract_element.text if abstract_element else ""
                # Drop the truncation ellipsis and collapse all whitespace runs.
                abstract = " ".join(abstract.replace("…", "").split())
                pub_info_element = article.select_one(".gs_a")
                pub_info = pub_info_element.text if pub_info_element else ""
                pub_info_parsed = extract_data_from_publication_info(pub_info)
                results.append({
                    "GGS_title": title_element.text,
                    "GGS_link": title_element["href"],
                    "GGS_brief_abstract": abstract,
                    "GGS_publication_info": pub_info,
                    "GGS_authors": pub_info_parsed["authors"],
                    "GGS_journal": pub_info_parsed["journal"],
                    "GGS_year": pub_info_parsed["year"],
                    "GGS_publisher": pub_info_parsed["publisher"],
                })
            time.sleep(random.uniform(1, 3))  # Random sleep to avoid being blocked
        except Exception as e:
            print(f"An error occurred: {e}")
            break
    return results
def save_to_csv(data, filename="base_crawling_from_gg_scholar.csv"):
    """Write a list of flat dicts to *filename* as CSV.

    Column order follows the keys of the first row; "utf-8-sig" writes a
    UTF-8 BOM so spreadsheet apps detect the encoding. Empty *data* is a
    no-op (the original crashed on data[0] with an IndexError).
    """
    if not data:
        print("No data to save.")
        return
    with open(filename, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
    # Bug fix: the original printed a literal "(unknown)" placeholder
    # instead of interpolating the actual filename.
    print(f"Data saved to {filename}")
def main():
    """Interactive CLI: prompt for a query, scrape, display, optionally save."""
    query = input("Enter your search query: ")
    num_pages = int(input("Enter the number of pages to scrape: "))
    use_time_filter = input("Do you want to filter by year? (y/n): ").strip().lower()
    start_year = None
    end_year = None
    if use_time_filter == 'y':
        start_year = input("Enter the start year (format: YYYY; for example: 2020): ")
        end_year = input("Enter the end year (format: YYYY; for example: 2025): ")
    results = scrape_gg_scholar(query, num_pages, start_year, end_year)
    print(f"Found {len(results)} results.")
    for i, result in enumerate(results):
        # Bug fix: result keys carry the "GGS_" prefix assigned by
        # scrape_gg_scholar; the original looked up unprefixed keys
        # ('title', 'link', ...) and raised KeyError on the first result.
        print(f"{i + 1}. {result['GGS_title']}")
        print(f"   Link: {result['GGS_link']}")
        print(f"   Brief Abstract: {result['GGS_brief_abstract']}")
        print(f"   Publication Info: {result['GGS_publication_info']}")
        print(f"   Authors: {result['GGS_authors']}")
        print(f"   Journal: {result['GGS_journal']}")
        print(f"   Year: {result['GGS_year']}")
        print(f"   Publisher: {result['GGS_publisher']}")
        print("=" * 100)
    save_option = input("Do you want to save the results to a CSV file? (y/n): ").strip().lower()
    if save_option == 'y':
        file_name = input("Enter the filename (default file name is 'base_crawling_from_gg_scholar.csv', enter fine name without extension): ").strip()
        # Bug fix: the original prompted for a filename and then ignored it,
        # always saving to the default. Honor the answer; fall back to the
        # default when the user just presses Enter.
        if file_name:
            save_to_csv(results, file_name + ".csv")
        else:
            save_to_csv(results)

if __name__ == "__main__":
    main()
| # The code is designed to scrape Google Scholar for academic articles based on a search query. |