File size: 1,470 Bytes
565e754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import argparse

from structlog import get_logger
import pandas as pd

from src.config import pyro_source, CHANNEL_ID


# Upper bound on posts requested per load_messages call; also the offset
# stride between consecutive batches in main().
BATCH_SIZE = 256
# Module-level structlog logger used for progress messages.
logger = get_logger()


def save_batch(df: pd.DataFrame, out_path: str, is_first_batch: bool):
    """Write one batch of rows to the CSV file at ``out_path``.

    Args:
        df: Rows to write; the index is never written.
        out_path: Destination CSV path.
        is_first_batch: When truthy, the file is (re)created and the header
            row is written. Otherwise the rows are appended with no header.
    """
    starts_file = bool(is_first_batch)
    write_mode = "w" if starts_file else "a"
    # The header is emitted only by the write that creates the file.
    df.to_csv(out_path, index=False, mode=write_mode, header=starts_file)


def main():
    """Load Telegram channel posts in batches and store them in a CSV file.

    Command-line arguments:
        --channel_id: channel to load from (defaults to ``CHANNEL_ID``).
        --limit: total number of posts to request (required).
        --offset: offset passed to the source for the first request
            (default 0).

    Posts are requested in chunks of at most ``BATCH_SIZE`` and written to
    ``./channel_<channel_id>_posts.csv``. When that file already exists and
    is non-empty, new rows are appended without repeating the header.
    Loading stops early as soon as a request returns no rows.
    """
    parser = argparse.ArgumentParser(description="Telegram posts loader")

    parser.add_argument("--channel_id", type=str, default=CHANNEL_ID)
    parser.add_argument("--limit", type=int, required=True)
    parser.add_argument("--offset", type=int, default=0)

    args = parser.parse_args()
    total_limit = args.limit
    channel_id = args.channel_id
    base_offset = args.offset

    out_path = f"./channel_{channel_id}_posts.csv"
    # A missing or zero-byte file has no header row yet, so the first write
    # must create it; otherwise the resulting CSV would be headerless.
    is_first_batch = (
        not os.path.exists(out_path) or os.path.getsize(out_path) == 0
    )

    # Ceiling division: number of requests needed to cover total_limit.
    total_batches = (total_limit + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_num in range(total_batches):
        logger.info(f"Batch #{batch_num} loading")

        requested_so_far = batch_num * BATCH_SIZE
        current_offset = base_offset + requested_so_far
        # Shrink the final request so the total never exceeds --limit.
        batch_limit = min(BATCH_SIZE, total_limit - requested_so_far)

        posts = pyro_source.load_messages(
            channel_id=channel_id,
            limit=batch_limit,
            offset=current_offset,
        )

        df = pd.DataFrame(posts)
        if df.empty:
            # Writing an empty frame would emit a bogus header line and
            # suppress the real header for later batches.
            # NOTE(review): assumes an empty batch means no posts remain at
            # higher offsets -- confirm against pyro_source.load_messages.
            logger.info("No posts returned, stopping early")
            break

        save_batch(df, out_path, is_first_batch)
        is_first_batch = False

    logger.info("Finished loading")




# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()