import re
import csv
import json
import time
import random
import requests
import feedparser
from bs4 import BeautifulSoup
from urllib.parse import quote
from datetime import datetime

def extract_data_from_publication_info(publication_info):

    authors = ""
    journal = ""
    year = ""
    publisher = ""

    regex_pattern = r"(.+?)\s+-\s+(.+?),\s+(\d{4})\s+-\s+(.+)$"

    match = re.match(regex_pattern, publication_info)

    if match:
        authors = match.group(1).strip()
        journal = match.group(2).strip()
        year = match.group(3).strip()
        publisher = match.group(4).strip()
    else:
        authors = "Unknown"
        journal = "Unknown"
        year = "Unknown"
        publisher = "Unknown"

    return {
        "authors": authors,
        "journal": journal,
        "year": year,
        "publisher": publisher
    }

def scrape_gg_scholar(query, num_pages=1, start_year=None, end_year=None):
    results = []

    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    base_url = "https://scholar.google.com/scholar?"

    params = {
        "q": query.replace(" ", "+"),
        "hl": "en",
        "as_sdt": "0,5"
    }

    if start_year and end_year:
        params["as_ylo"] = start_year
        params["as_yhi"] = end_year

    for pages in range(num_pages):
        start = pages * 10
        params["start"] = start

        url_params = "&".join([f"{k}={v}" for k, v in params.items()])

        url = base_url + url_params

        try:
            response = requests.get(url, headers=header)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")

                articles = soup.select(".gs_r.gs_or.gs_scl")

                for article in articles:
                    title_element = article.select_one(".gs_rt a")

                    if title_element:
                        title = title_element.text
                        link = title_element["href"]

                        abstract_element = article.select_one(".gs_rs")
                        abstract = abstract_element.text if abstract_element else ""
                        abstract = abstract.replace("…", "").strip()
                        abstract = abstract.replace("\n", "").strip()
                        abstract = " ".join(abstract.split())

                        pub_info_element = article.select_one(".gs_a")
                        pub_info = pub_info_element.text if pub_info_element else ""

                        pub_info_parsed = extract_data_from_publication_info(pub_info)

                        results.append({
                            "GGS_title": title,
                            "GGS_link": link,
                            "GGS_brief_abstract": abstract,
                            "GGS_publication_info": pub_info,
                            "GGS_authors": pub_info_parsed["authors"],
                            "GGS_journal": pub_info_parsed["journal"],
                            "GGS_year": pub_info_parsed["year"],
                            "GGS_publisher": pub_info_parsed["publisher"]
                        })

                time.sleep(random.uniform(1, 3))  # Random sleep to avoid being blocked

            else:
                print(f"ERROR: STATUS CODE {response.status_code}")
                break
        
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    return results

def save_to_csv(data, filename="base_crawling_from_gg_scholar.csv"):
    with open(filename, "w", newline="", encoding="utf-8-sig") as csvfile:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"Data saved to {filename}")

def main():
    query = input("Enter your search query: ")
    num_pages = int(input("Enter the number of pages to scrape: "))
    
    use_time_filter = input("Do you want to filter by year? (y/n): ").strip().lower()
    start_year = None
    end_year = None

    if use_time_filter == 'y':
        start_year = input("Enter the start year (format: YYYY; for example: 2020): ")
        end_year = input("Enter the end year (format: YYYY; for example: 2025): ")

    results = scrape_gg_scholar(query, num_pages, start_year, end_year)

    print(f"Found {len(results)} results.")

    for i, result in enumerate(results):
        print(f"{i + 1}. {result['title']}")
        print(f"   Link: {result['link']}")
        print(f"   Brief Abstract: {result['brief_abstract']}")
        print(f"   Publication Info: {result['publication_info']}")
        print(f"   Authors: {result['authors']}")
        print(f"   Journal: {result['journal']}")
        print(f"   Year: {result['year']}")
        print(f"   Publisher: {result['publisher']}")
        print("=" * 100)

    save_option = input("Do you want to save the results to a CSV file? (y/n): ").strip().lower()
    if save_option == 'y':
        file_name = input("Enter the filename (default file name is 'base_crawling_from_gg_scholar.csv', enter fine name without extension): ")
        save_to_csv(results)

if __name__ == "__main__":
    main()
# The code is designed to scrape Google Scholar for academic articles based on a search query.