File size: 4,848 Bytes
0b77e4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
##packages code
import pandas as pd
import numpy as np
import json, requests 
import pandas as pd

#import spacy
#!python -m spacy download en_core_web_lg
#spacy_nlp = spacy.load("en_core_web_lg")

#import en_core_web_lg
#spacy_nlp = en_core_web_lg.load()

#################### Test Scripts #############################

def URL(base_URL, entity_type, filters):
    """Assemble a request URL from its three parts.

    The parts are joined with ``+`` in the order given; no separator
    (``/``, ``?`` ...) is inserted between them, so each part must already
    carry whatever delimiter it needs.
    """
    return base_URL + entity_type + filters


def get_data(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return its body decoded from JSON.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled server would block the caller
            indefinitely.

    Returns:
        The Python object produced by ``json.loads`` on the response text.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
        Any exception raised by ``requests.get`` (connection errors,
        timeouts, ...) is propagated unchanged.
    """
    # The HTTP status is deliberately not checked: callers inspect the
    # decoded payload themselves.
    response = requests.get(url, timeout=timeout)
    return json.loads(response.text)

## reconstructing the abstract from its inverted index


def reconstruction_abstract(abstract_inverted_index):
    """Rebuild an abstract from an inverted index mapping word -> positions.

    Args:
        abstract_inverted_index: Mapping whose keys are words and whose
            values are iterables of the integer positions at which the word
            occurs in the text, or ``None``.

    Returns:
        The words placed at their positions and joined with single spaces.
        Positions that no word claims are left as empty strings. Returns
        ``None`` when the index is ``None``, empty, or contains no position
        at all (i.e. there is no abstract to rebuild).
    """
    if not abstract_inverted_index:
        return None

    # The highest position fixes the length of the text; ``default`` avoids
    # the ValueError ``max`` raises when every position list is empty.
    last_position = max(
        (
            position
            for positions in abstract_inverted_index.values()
            for position in positions
        ),
        default=None,
    )
    if last_position is None:
        return None

    words = [""] * (last_position + 1)
    for word, positions in abstract_inverted_index.items():
        for position in positions:
            words[position] = word

    return " ".join(words)
            
    
## Euclidean norm and cosine similarity of two vectors

def norm(vector):
    """Return the Euclidean (L2) length of ``vector``.

    The squares of the components are added with the built-in ``sum`` and
    the square root is taken with ``np.sqrt``.
    """
    squares = [component * component for component in vector]
    return np.sqrt(sum(squares))

def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity of ``vec_a`` and ``vec_b``.

    The dot product is taken over ``zip(vec_a, vec_b)``, so when the inputs
    differ in length the extra components of the longer one are ignored in
    the numerator (but still count towards its norm). There is no guard
    against a zero-length vector: the division is performed as-is.
    """
    length_a = norm(vec_a)
    length_b = norm(vec_b)
    products = [left * right for left, right in zip(vec_a, vec_b)]
    return sum(products) / (length_a * length_b)

################################### Extracted texts ###############################################################

def extract_quantitative_data_paper(work_id):
    """Fetch an OpenAlex work, print a coloured summary and return its fields.

    Args:
        work_id: Identifier of the work; it is converted with ``str`` and
            appended to ``https://api.openalex.org/works/``.

    Returns:
        ``[url, date, title, abstract, concepts, authors, institutions]`` on
        success, where ``concepts``, ``authors`` and ``institutions`` are
        comma-separated strings and ``abstract`` is ``None`` when the work
        has no abstract. ``institutions`` lists the first institution of
        each author, de-duplicated in order of first appearance; authors
        without any institution are skipped.
        ``None`` (after printing a hint) when the request fails or the
        response does not have the expected fields.
    """
    url = "https://api.openalex.org/works/" + str(work_id)

    # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit are not swallowed.
    try:
        data = get_data(url)
        date = data["publication_date"]
        title = data["title"]
        abstract = reconstruction_abstract(data["abstract_inverted_index"])
        authorships = data["authorships"]
        concepts = ", ".join([elem["display_name"] for elem in data["concepts"]])
        authors = ", ".join([elem["author"]["display_name"] for elem in authorships])
        # An author may have an empty institution list; indexing [0]
        # unconditionally would reject an otherwise valid work.
        first_institutions = [
            elem["institutions"][0]["display_name"]
            for elem in authorships
            if elem.get("institutions")
        ]
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        institutions = ", ".join(
            dict.fromkeys(name for name in first_institutions if name)
        )
    except Exception:
        print("Enter a valid work id from OpenAlex")
        return None

    print("\033[96mFROM OpenAlex: ")
    print("\033[92mPaper link: " + url)
    print(" ")
    # title and date may be null in the payload; print an empty value then.
    print("\033[92mTitle: \x1b[0m" + (title or ""))
    print(" ")
    print("\033[92mConcepts: \x1b[0m" + concepts)
    print(" ")
    print("\033[92mDate: \x1b[0m" + (date or ""))
    print(" ")
    print("\033[92mAuthors: \x1b[0m" + authors)
    print(" ")
    print("\033[92mInstitutions: \x1b[0m" + institutions)
    print(" ")
    return [url, date, title, abstract, concepts, authors, institutions]
        
   
def extract_quantitative_data_patent(patent_id):
    """Fetch a patent from PatentsView, print a coloured summary and return it.

    Args:
        patent_id: Identifier of the patent; it is converted with ``str``
            and embedded in the PatentsView query URL.

    Returns:
        ``(data, url, assignees, inventors)`` on success, where ``data`` is
        the first entry of the ``"patents"`` list in the response and
        ``assignees`` / ``inventors`` are comma-separated strings.
        ``None`` (after printing a hint) when the request fails or the
        response does not have the expected fields. Previously this path
        fell through to the ``return`` statement with unbound locals and
        raised ``UnboundLocalError``.
    """
    # NOTE(review): this targets the api.patentsview.org query endpoint -
    # confirm that it is still served.
    url = "https://api.patentsview.org/patents/query?q={%22patent_id%22:%22" + str(patent_id) + "%22}&f=[%22patent_number%22,%22patent_title%22,%22patent_abstract%22,%22patent_date%22,%22inventor_last_name%22,%22inventor_first_name%22,%22assignee_organization%22]"

    # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit are not swallowed.
    try:
        data = get_data(url)["patents"][0]
        title = data["patent_title"]
        abstract = data["patent_abstract"]
        date = data["patent_date"]
        # Drop a missing first/last name instead of failing on None + str.
        inventors = ", ".join(
            " ".join(
                part
                for part in (
                    inventor["inventor_first_name"],
                    inventor["inventor_last_name"],
                )
                if part
            )
            for inventor in data["inventors"]
        )
        # str() keeps a null organization printable.
        assignees = ", ".join(
            str(assignee["assignee_organization"])
            for assignee in data["assignees"]
        )
    except Exception:
        print("Enter a valid patent_id from PatentsView")
        return None

    print("\033[96mFROM PATENTSVIEW:")
    print("\033[92mPatent link: \x1b[0m" + url)
    print(" ")
    print("\033[92mTitle: \x1b[0m" + (title or ""))
    print(" ")
    print("\033[92mAbstract: \x1b[0m" + (abstract or ""))
    print(" ")
    print("\033[92mDate: \x1b[0m" + (date or ""))
    print(" ")
    print("\033[92mInventors: \x1b[0m", inventors)
    print(" ")
    print("\033[92mAssignee: \x1b[0m", assignees)

    return data, url, assignees, inventors
    

#print("Do you want the quantitative information corresponding to a paper?")