"""Pulls papers from arxiv and gets their embeddings."""
from collections import defaultdict
from functools import partial
from datetime import datetime
import heapq
import json
import os
from pathlib import Path
import pickle

from datasets import Dataset
import kaggle
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from core.recommender import EmbeddingProcessor
from data_pipeline.config import DataPaths

# File name of the arXiv metadata snapshot.
# NOTE(review): the functions visible in this file read DataPaths.ARXIV_FNAME /
# DataPaths.ARXIV_PATH instead of this constant -- confirm it is still needed.
arxiv_fname = "arxiv-metadata-oai-snapshot.json"

def download_arxiv_data():
    """Download and unzip the arXiv dataset from Kaggle unless already present.

    The target directory is `DataPaths.BASE_DIR`. The download is skipped when
    any entry of that directory has `DataPaths.ARXIV_FNAME` in its name.

    Returns:
    - `DataPaths.ARXIV_FNAME`.
    """
    dataset = "Cornell-University/arxiv"
    data_path = DataPaths.BASE_DIR

    # On a fresh checkout the data directory may not exist yet, and
    # `os.listdir` would raise FileNotFoundError before anything is downloaded.
    os.makedirs(data_path, exist_ok=True)

    if any(DataPaths.ARXIV_FNAME in file for file in os.listdir(data_path)):
        print(f"Data already downloaded at {DataPaths.ARXIV_FNAME}.")
    else:
        kaggle.api.dataset_download_cli(dataset, path=data_path, unzip=True)
    return DataPaths.ARXIV_FNAME

def get_lbl_from_name(names):
    """Turn parsed author names into display labels.

    Every entry is indexed as (last_name, first_name, middle_name) and becomes
    'first_name last_name', or 'first_name middle_name last_name' when the
    middle name is not the empty string.
    """
    labels = []
    for parts in names:
        last, first, middle = parts[0], parts[1], parts[2]
        # Leave the middle name out entirely when it is empty so the label
        # does not end up with a double space.
        ordered = [first, last] if middle == '' else [first, middle, last]
        labels.append(' '.join(ordered))
    return labels

def filter_arxiv_for_ml(obtain_summary=False, authors_of_interest=None):
    """Sift through the downloaded arXiv file and keep ML-related papers.

    Reads `DataPaths.ARXIV_PATH` line by line (one JSON record per line) and
    copies every record whose `categories` mention one of cs.AI, cs.CL, cs.CV,
    cs.LG or stat.ML to a sibling file ending in `-ml.json`. Lines that cannot
    be parsed as JSON are skipped.

    Parameters:
    - obtain_summary: If True, also save a pickled DataFrame next to the arXiv
      file (suffix `-summary.pkl`). For every record that has them it holds the
      raw `categories` string, the creation date of the last version
      (`lv_date`), one sparse 0/1 column per category, and the number of
      categories (`ncats`).
    - authors_of_interest: Optional collection of author names formatted as
      'first_name [middle_name] last_name'. If not None, only ML-related papers
      with at least one of these authors are kept; if None, every ML-related
      paper is kept.

    Returns:
    - None. Nothing is written when the requested output file(s) already exist.
    """
    # NOTE: everything after the first '.' of the full path is dropped, so this
    # assumes no directory component of ARXIV_PATH contains a dot.
    path_stem = str(DataPaths.ARXIV_PATH).split('.')[0]
    ml_path = path_stem + '-ml.json'
    summary_path = path_stem + '-summary.pkl'

    ml_cats = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'stat.ML']

    if obtain_summary and Path(ml_path).exists() and Path(summary_path).exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        print(f"Summary file {summary_path} already exists. Skipping.")
        return
    if not obtain_summary and Path(ml_path).exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        return

    if obtain_summary:
        gdf = {'categories': [], 'lv_date': []}  # global data

    # `None` means "no author filter". An empty collection is still a filter
    # (one that matches nothing), so test against None rather than truthiness.
    if authors_of_interest is not None:
        authors_of_interest = set(authors_of_interest)

    # Load the JSON file line by line
    with open(DataPaths.ARXIV_PATH, 'r', encoding='utf-8') as f1, \
            open(ml_path, 'w', encoding='utf-8') as f2:
        for line in tqdm(f1):
            # Parse each line as JSON
            try:
                entry_data = json.loads(line)
            except json.JSONDecodeError:
                # Skip lines that cannot be parsed as JSON
                continue
            if not isinstance(entry_data, dict):
                # Valid JSON but not a record; nothing to inspect.
                continue

            # check categories and last version in entry data
            if (
                obtain_summary
                and 'categories' in entry_data
                and 'versions' in entry_data
                and len(entry_data['versions'])
                and 'created' in entry_data['versions'][-1]
            ):
                gdf['categories'].append(entry_data['categories'])
                gdf['lv_date'].append(entry_data['versions'][-1]['created'])

            # ml data: cheap category test first, author labels only if needed
            if 'categories' not in entry_data:
                continue
            if not any(cat in entry_data['categories'] for cat in ml_cats):
                continue
            if authors_of_interest is not None:
                # A record without parsed authors cannot match any author.
                authors_on_paper = get_lbl_from_name(entry_data.get('authors_parsed') or [])
                if authors_of_interest.isdisjoint(authors_on_paper):
                    continue
            f2.write(line)

    if obtain_summary:
        gdf = pd.DataFrame(gdf)
        gdf['lv_date'] = pd.to_datetime(gdf['lv_date'])
        gdf = gdf.sort_values('lv_date', axis=0).reset_index(drop=True)

        cats = set()
        for cat_combo in gdf['categories'].unique():
            cats.update(cat_combo.split(' '))
        print(f'Columnizing {len(cats)} categories. ')

        # Pad with spaces and search for ' cat ' literally so that a category
        # only matches as a whole token: 'astro-ph' must not flag 'astro-ph.CO',
        # and the '.' in names such as 'cs.AI' must not act as a regex wildcard.
        padded_categories = ' ' + gdf['categories'] + ' '
        for cat in tqdm(cats):
            gdf[cat] = pd.arrays.SparseArray(
                padded_categories.str.contains(' ' + cat + ' ', regex=False),
                fill_value=0,
                dtype=np.int8,
            )

        # count number of categories item is associated with
        gdf['ncats'] = gdf['categories'].str.count(' ') + 1

        # write to pickle file
        with open(summary_path, 'wb') as f:
            pickle.dump(gdf, f)

def get_professors_and_relevant_papers(us_professors, k=8, cutoff=datetime(2022, 10, 1)):
    """
    Map each U.S. professor to their most recent papers in DataPaths.ML_ARXIV_PATH.

    This function specifies which papers we are interested in for each
    professor (the k most recent ones uploaded on or after `cutoff`).
    Solo-authored papers and papers with more than 20 authors are ignored.

    Parameters:
    - us_professors: A collection of U.S. professor names to match against,
                     formatted as 'first_name [middle_name] last_name'.
    - k: The number of most recent papers to keep for each professor, based on
         the first version upload date.
    - cutoff (datetime): Papers whose first version was created before this
                         date are skipped (default: October 1, 2022).

    Returns:
    - defaultdict(list): Keys are professor names; each value is a min-heap
      (a plain list maintained with `heapq`) holding at most k tuples of
      (first-version datetime, arXiv id).
    """
    # Set membership is O(1); the name list is probed once per author per paper.
    professors = set(us_professors)

    # professors to heap of (datetime, arxiv_id)
    p2p = defaultdict(list)

    with open(DataPaths.ML_ARXIV_PATH, 'r') as f:
        for line in f:
            try:
                ml_data = json.loads(line)
                paper_authors = get_lbl_from_name(ml_data['authors_parsed'])

                # filter the same way as in `conference_scraper.py`
                # ignore solo-authored papers and papers with more than 20 authors
                if len(paper_authors) == 1 or len(paper_authors) > 20:
                    continue

                try:
                    dt = datetime.strptime(ml_data["versions"][0]["created"], '%a, %d %b %Y %H:%M:%S %Z')
                    if dt < cutoff:
                        continue
                except (KeyError, ValueError) as e:
                    print(f"Failed to parse date: \n{ml_data}\nError: {e}")
                    # The paper is still considered, but with a sentinel date so
                    # it is the first one evicted once a professor has more
                    # than k papers.
                    dt = datetime(2000, 1, 1)  # before the default cutoff date

                # every author position counts, not only first or last author
                for paper_author in paper_authors:
                    if paper_author in professors:
                        # make a connection; the min-heap drops the oldest paper
                        # whenever more than k are held
                        heapq.heappush(p2p[paper_author], (dt, ml_data["id"]))
                        if len(p2p[paper_author]) > k:
                            heapq.heappop(p2p[paper_author])
            except Exception as e:
                # Best effort: report the malformed record and keep going.
                # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
                print(f"Skipping line ({e!r}): {line}")
    return p2p

def gen(p2p):
    """Yield one dataset row per paper referenced in `p2p`.

    `p2p` maps professor names to sequences whose items carry an arXiv id at
    index 1. DataPaths.ML_ARXIV_PATH is scanned once, and for every record whose
    id is referenced a dict with the keys `id`, `title`, `abstract` and
    `authors` is produced. `authors` only lists the paper's authors that are
    keys of `p2p`.
    """
    wanted_ids = {entry[1] for entries in p2p.values() for entry in entries}

    with open(DataPaths.ML_ARXIV_PATH, 'r') as f:
        for raw_line in f:
            record = json.loads(raw_line)
            paper_id = record["id"]
            if paper_id not in wanted_ids:
                continue

            # keep authors who are U.S. professors
            professor_authors = [
                name
                for name in get_lbl_from_name(record["authors_parsed"])
                if name in p2p
            ]

            yield {
                "id": paper_id,
                "title": record["title"],
                "abstract": record["abstract"],
                "authors": professor_authors,
            }

def save_paper_to_professor(p2p, save_path):
    """Invert the professor -> papers mapping and save it as JSON.

    Parameters:
    - p2p: Mapping from professor name to an iterable of (datetime, arxiv_id)
           pairs; only the arXiv id of each pair is used.
    - save_path: Destination of the JSON file. Missing parent directories are
                 created.

    Returns:
    - defaultdict(list): Maps each arXiv id to the list of professor names
      associated with it (the same object that was written to disk).
    """
    id2p = defaultdict(list)
    for professor, dt_and_ids in p2p.items():
        for _, id_ in dt_and_ids:
            id2p[id_].append(professor)

    save_dir = os.path.dirname(save_path)
    # A bare file name has an empty dirname, and `os.makedirs('')` raises
    # FileNotFoundError; only create directories when there are any to create.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(save_path, 'w') as f:
        json.dump(id2p, f)
    return id2p


def main():
    """Build the front-end data files from the saved paper embeddings.

    The earlier pipeline stages (download, ML filtering, dataset creation and
    embedding extraction) are kept below as commented-out code; only the last
    stage currently runs. It loads the dataset at `DataPaths.EMBD_PATH` and
    writes:
    - a CSV with the columns `id`, `title` and `authors` (author names joined
      with "|-|") to `DataPaths.FRONTEND_ITA_PATH`;
    - the `embeddings` column as a torch tensor to
      `DataPaths.FRONTEND_WEIGHTS_PATH`.
    """

    ### Download and filter for ML papers written by U.S. professors ###
    # print("Downloading data...")
    # download_arxiv_data()
    # with open(DataPaths.US_PROF_PATH, 'r') as f:
    #     authors_of_interest = json.load(f)
    # authors_of_interest = [author['name'] for author in authors_of_interest]
    # print("Filtering data for ML papers...")
    # filter_arxiv_for_ml(authors_of_interest=authors_of_interest)

    # ### Create a dataset containing paper info, e.g., title, abstract, authors, etc. ###
    # print("Saving paper data to disk at " + DataPaths.PAPER_DATA_PATH)
    # p2p = get_professors_and_relevant_papers(authors_of_interest)
    # ds = Dataset.from_generator(partial(gen, p2p))
    # ds.save_to_disk(DataPaths.PAPER_DATA_PATH)

    # ### Extract paper embeddings ###
    # print("Extracting embeddings (use GPU if possible)...")
    # # Initialize the embedding processor with model names
    # embedding_processor = EmbeddingProcessor(
    #     model_name='sentence-transformers/all-mpnet-base-v2',
    #     custom_model_name='salsabiilashifa11/sbert-paper'
    # )
    # # Process dataset and save with embeddings
    # embedding_processor.process_dataset(DataPaths.PAPER_DATA_PATH, DataPaths.EMBD_PATH, batch_size=128)

    ### Create front-end data ###

    # Filter ds for paper title, id, authors, and embedding
    embds = Dataset.load_from_disk(DataPaths.EMBD_PATH)

    def join_authors(x):
        # Flatten the author list into one string so it fits a single CSV cell.
        x['authors'] = "|-|".join(x['authors'])
        return x

    # (A leftover `pdb.set_trace()` breakpoint used to sit here; it blocked
    # every non-interactive run and has been removed.)
    embds = embds.map(join_authors)

    # save id, title and authors to disk
    embds.select_columns(['id', 'title', 'authors']).to_csv(DataPaths.FRONTEND_ITA_PATH)
    # save embeddings as torch tensor
    embds_weights = torch.Tensor(embds['embeddings'])
    torch.save(embds_weights, DataPaths.FRONTEND_WEIGHTS_PATH)

if __name__ == "__main__":
    main()