File size: 2,619 Bytes
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c02ce2
48c27bb
 
 
 
3c02ce2
48c27bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import time
import csv
from wiki_data_fetcher import (
    get_previous_revisions,
    extract_revision_info,
    get_wikipedia_introduction,
)

# Parallel accumulator lists: index i of every list below refers to the same
# Wikipedia article, so they can later be zipped row-wise into a CSV.
title = []
revid_0 = []
revid_10 = []
revid_100 = []
ts_0 = []
ts_10 = []
ts_100 = []
intro_0 = []
intro_10 = []
intro_100 = []


if __name__ == "__main__":

    # The CSV header never changes, so build it once instead of rebuilding it
    # on every title iteration (it was loop-invariant).
    # fmt: off
    column_names = [
        "title", "revid_0", "revid_10", "revid_100",
        "ts_0", "ts_10", "ts_100",
        "intro_0", "intro_10", "intro_100",
    ]
    # fmt: on

    # Open the file in read mode
    with open("data/wikipedia_titles.txt", "r") as file:
        # Iterate through each line in the file
        for line in file:
            # Get title from each line without trailing newline characters
            this_title = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file) so we
            # never query the API with an empty title.
            if not this_title:
                continue
            print(this_title)
            # Append title
            title.append(this_title)
            # Get info for most recent 100 revisions
            json_data = get_previous_revisions(this_title, revisions=100)
            # Append data for the current revision and for the 10th and 100th
            # revisions before it.  The three cases differ only in the revision
            # offset, the destination lists, and the extra keyword arguments,
            # so drive them from a table instead of triplicating the code.
            # (Offset 0 keeps the original call without limit_revnum so the
            # helper's default is still used.)
            for revnum, revids, timestamps, intros, kwargs in (
                (0, revid_0, ts_0, intro_0, {}),
                (10, revid_10, ts_10, intro_10, {"limit_revnum": False}),
                (100, revid_100, ts_100, intro_100, {"limit_revnum": False}),
            ):
                info = extract_revision_info(json_data, revnum, **kwargs)
                revids.append(info["revid"])
                timestamps.append(info["timestamp"])
                intros.append(get_wikipedia_introduction(info["revid"]))

            # Write the CSV in each loop in case we need to restart after an error
            # Combine the lists row-wise; zip is an iterator, so it must be
            # rebuilt every iteration over the now-longer lists.
            # fmt: off
            export_data = zip(
                title, revid_0, revid_10, revid_100,
                ts_0, ts_10, ts_100, intro_0, intro_10, intro_100,
            )
            # fmt: on

            with open(
                "data/wikipedia_introductions.csv", "w", newline="", encoding="utf-8"
            ) as myfile:
                wr = csv.writer(myfile)
                # Write a header row
                wr.writerow(column_names)
                # Write the combined data rows
                wr.writerows(export_data)

            # Rate limit our API calls
            time.sleep(5)