File size: 1,470 Bytes
565e754 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import os
import argparse
from structlog import get_logger
import pandas as pd
from src.config import pyro_source, CHANNEL_ID
# Number of posts requested per load_messages call; also the step by which
# the offset advances between consecutive batches.
BATCH_SIZE = 256
# Module-level structlog logger used for progress messages.
logger = get_logger()
def save_batch(df: pd.DataFrame, out_path: str, is_first_batch: bool):
    """Write one batch of rows to the CSV file at ``out_path``.

    Args:
        df: Rows to write; the DataFrame index is never written.
        out_path: Destination CSV path.
        is_first_batch: When truthy the file is (re)created and a header row
            is emitted; otherwise the rows are appended without a header.
    """
    # Pick the file mode and header flag together so they cannot disagree.
    write_mode, with_header = ("w", True) if is_first_batch else ("a", False)
    df.to_csv(out_path, index=False, mode=write_mode, header=with_header)
def main():
    """Load channel posts in batches and store them in a CSV file.

    Command-line arguments:
        --channel_id: Channel to read from (defaults to ``CHANNEL_ID``).
        --limit: Total number of posts to request (required).
        --offset: Offset passed to the first batch request (default 0).

    Posts are requested through ``pyro_source.load_messages`` in chunks of at
    most ``BATCH_SIZE`` and written to ``./channel_<channel_id>_posts.csv``.
    When that file already exists and is non-empty, new rows are appended
    without repeating the header.
    """
    parser = argparse.ArgumentParser(description="Telegram posts loader")
    parser.add_argument("--channel_id", type=str, default=CHANNEL_ID)
    parser.add_argument("--limit", type=int, required=True)
    parser.add_argument("--offset", type=int, default=0)
    args = parser.parse_args()

    total_limit = args.limit
    channel_id = args.channel_id
    base_offset = args.offset

    out_path = f"./channel_{channel_id}_posts.csv"
    # A header is needed when the file is missing or has no content yet
    # (a zero-byte file carries no header to append under).
    has_content = os.path.exists(out_path) and os.path.getsize(out_path) > 0
    is_first_batch = not has_content

    # range() yields the start of every batch relative to base_offset; it is
    # empty for a non-positive limit, so nothing is requested in that case.
    for batch_num, batch_start in enumerate(range(0, total_limit, BATCH_SIZE)):
        logger.info(f"Batch #{batch_num} loading")
        # The final batch may be shorter so that no more than --limit posts
        # are requested in total.
        batch_limit = min(BATCH_SIZE, total_limit - batch_start)
        posts = pyro_source.load_messages(
            channel_id=channel_id,
            limit=batch_limit,
            offset=base_offset + batch_start,
        )
        df = pd.DataFrame(posts)
        if df.empty:
            # Writing an empty frame would create a header-less file and make
            # every later batch append rows without a header, so skip it and
            # leave is_first_batch untouched.
            logger.info(f"Batch #{batch_num} returned no posts, skipping")
            continue
        save_batch(df, out_path, is_first_batch)
        is_first_batch = False

    logger.info("Finished loading")
# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|