File size: 3,782 Bytes
565e754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import time
import datetime
from typing import Union, Generator, List, Dict, Any, Optional

from pyrogram import Client
from pyrogram.types import Message


class PyroSource:
    """Synchronous helper that reads chat history through a Pyrogram ``Client``.

    Every loader enters ``self.client`` as a context manager for the duration
    of the call and returns a list of plain dictionaries with the keys
    ``message_dt``, ``message_id``, ``channel_id``, ``content``, ``views``
    and ``original_author``.
    """

    def __init__(
        self,
        api_id: Union[int, str],
        api_hash: str,
        app_name: str = "default_app",
    ):
        """Build the Pyrogram client.

        The client is only entered (``with self.client``) inside the loaders.

        api_id: Telegram API id
        api_hash: Telegram API hash
        app_name: passed to ``Client`` as ``name``
        """
        self.client = Client(name=app_name, api_id=api_id, api_hash=api_hash)

    @staticmethod
    def _as_date(value: datetime.date) -> datetime.date:
        """Reduce a ``datetime.datetime`` to its date; return plain dates as is."""
        # datetime is a subclass of date, but the two cannot be ordered against
        # each other, so normalise before any comparison with msg.date.date().
        if isinstance(value, datetime.datetime):
            return value.date()
        return value

    @staticmethod
    def _build_meta(msg: "Message", channel_id: Union[int, str]) -> Dict[str, Any]:
        """Convert one message into the flat dictionary returned by the loaders."""
        forwarded_from = msg.forward_from_chat
        # A forwarded-from chat may have no username; fall back to '' in that
        # case too, so the field is always a string.
        original_author = (forwarded_from.username or '') if forwarded_from else ''

        return {
            "message_dt": msg.date.strftime("%Y-%m-%d"),
            "message_id": msg.id,
            "channel_id": channel_id,
            "content": msg.text or msg.caption or '',
            "views": msg.views,
            "original_author": original_author,
        }

    def load_messages(
        self,
        channel_id: Union[int, str],
        limit: int,
        offset: int = 0,
        offset_id: int = 0,
        time_sleep: float = 0.05,
    ) -> List[Dict[str, Any]]:
        """
        Load up to ``limit`` messages from the chat history.

        channel_id: channel id or username
        limit: number of messages to load
        offset: offset index
        offset_id: message id offset
        time_sleep: pause in seconds before each message is processed

        Returns one dictionary per message, in the order the history yields them.
        """
        posts = []

        with self.client as app:
            messages: Generator[Message, None, None] = app.get_chat_history(
                chat_id=channel_id,
                limit=limit,
                offset=offset,
                offset_id=offset_id,
            )

            for msg in messages:
                time.sleep(time_sleep)
                posts.append(self._build_meta(msg, channel_id))

        return posts

    def load_days(
        self,
        channel_id: Union[int, str],
        from_date: datetime.date,
        to_date: Optional[datetime.date] = None,
        limit: int = 1000,
        time_sleep: float = 0.05,
    ) -> List[Dict[str, Any]]:
        """
        Load messages whose date falls in the range [from_date, to_date].

        channel_id: channel id or username
        from_date: first day of the range (inclusive)
        to_date: last day of the range (inclusive); None means "up to the
            newest message"
        limit: safety limit on the number of messages pulled from the history
            (messages skipped as too new still count against it)
        time_sleep: pause in seconds before each message is processed

        Returns one dictionary per matching message. An inverted range
        (to_date earlier than from_date) yields an empty list.

        NOTE(review): dates are compared against ``msg.date.date()`` as-is;
        this assumes ``msg.date`` and the supplied dates share a timezone
        convention - confirm against the Pyrogram version in use.
        """
        from_date = self._as_date(from_date)
        if to_date is not None:
            to_date = self._as_date(to_date)
            if to_date < from_date:
                return []

        history_kwargs: Dict[str, Any] = {"chat_id": channel_id, "limit": limit}
        if to_date is not None:
            # offset_date makes the history start with messages older than the
            # given moment, so it must be anchored at the UPPER bound: midnight
            # after to_date. Anchoring it at from_date would cut off every
            # message newer than from_date.
            history_kwargs["offset_date"] = datetime.datetime.combine(
                to_date + datetime.timedelta(days=1),
                datetime.time.min,
            )

        posts = []

        with self.client as app:
            messages: Generator[Message, None, None] = app.get_chat_history(
                **history_kwargs
            )

            # The early break below relies on the history being yielded
            # newest-first.
            for msg in messages:
                time.sleep(time_sleep)

                msg_date = msg.date.date()

                # Gone past the lower bound: everything after this is older.
                if msg_date < from_date:
                    break

                # Newer than the upper bound: skip it and keep walking back.
                if to_date is not None and msg_date > to_date:
                    continue

                posts.append(self._build_meta(msg, channel_id))

        return posts