textmeme_search / src /parsing /vk_meme_parser.py
Futyn-Maker
Deploy the app
7e1f5f6
import os
from typing import Optional, Dict, Any
from urllib.parse import urlparse
import requests
import vk_api
class VKMemeParser:
    """Collect single-photo posts ("memes") from a VK community wall.

    The parser talks to the VK API through ``vk_api`` and can also download
    the images referenced by the posts it returns.
    """

    # Number of posts requested from wall.get per call.
    _PAGE_SIZE = 100

    def __init__(self, token: str):
        """
        Initialize the VK Meme Parser.

        Args:
            token (str): VK API access token.
        """
        self.vk_session = vk_api.VkApi(token=token)
        self.vk = self.vk_session.get_api()

    def _process_post(self, post: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Process a single post and extract relevant information.

        A post is accepted only if it is not marked as an ad, is not pinned,
        is not a repost, and has exactly one attachment of type "photo".

        Args:
            post (Dict[str, Any]): A dictionary containing post data.

        Returns:
            Optional[Dict[str, Any]]: A dictionary with the keys "id", "text"
            and "image_url" if the post is valid, None otherwise.
        """
        attachments = post.get("attachments", [])
        if (post.get("marked_as_ads")
                or "is_pinned" in post
                or "copy_history" in post
                or len(attachments) != 1
                or attachments[0]["type"] != "photo"):
            return None

        photo_sizes = attachments[0]["photo"]["sizes"]
        if not photo_sizes:
            # No image variant to point at; max() below would raise on an
            # empty sequence, so treat the post as unusable instead.
            return None

        # Pick the variant with the largest pixel area.
        largest_photo = max(
            photo_sizes,
            key=lambda size: size.get("width", 0) * size.get("height", 0))

        return {
            "id": post["id"],
            # A post without a "text" field is treated as having empty text.
            "text": post.get("text", "").strip(),
            "image_url": largest_photo["url"],
        }

    def get_memes(self, public_id: str) -> Dict[str, Any]:
        """
        Retrieve and process all meme posts from a specified public page.

        Args:
            public_id (str): Numeric ID (with or without a leading "-") or
                short name of the public page.

        Returns:
            Dict[str, Any]: A dictionary with the public's name under "name"
            and the list of processed meme posts under "posts".
        """
        is_numeric = public_id.isdigit() or (
            public_id.startswith("-") and public_id[1:].isdigit())

        params: Dict[str, Any]
        if is_numeric:
            # The same community is addressed in two ways: groups.getById
            # takes the positive ID, while wall.get addresses a community
            # wall with a negative owner_id (a positive one means a user).
            group_id = public_id.lstrip("-")
            params = {"owner_id": -int(group_id)}
        else:
            group_id = public_id
            params = {"domain": public_id}

        # Fetch public's name
        group_info = self.vk.groups.getById(group_id=group_id)[0]
        group_name = group_info["name"]

        memes = []
        offset = 0
        while True:
            response = self.vk.wall.get(
                count=self._PAGE_SIZE, offset=offset, **params)
            posts = response["items"]

            for post in posts:
                processed_post = self._process_post(post)
                if processed_post:
                    memes.append(processed_post)

            # A short (or empty) page means the wall is exhausted.
            if len(posts) < self._PAGE_SIZE:
                break

            # Pagination is by numeric offset: skip what was just received.
            offset += len(posts)

        return {
            "name": group_name,
            "posts": memes
        }

    def download_image(
            self,
            image_url: str,
            folder_path: str,
            timeout: float = 30.0) -> Optional[str]:
        """
        Download an image from the given URL and save it to the specified folder.

        Args:
            image_url (str): The URL of the image to download.
            folder_path (str): The path to the folder where the image should
                be saved. It is created if it does not exist.
            timeout (float): Seconds to wait for the server before giving up.

        Returns:
            Optional[str]: The file name (not the full path) of the saved
            image inside ``folder_path``, or None if the URL has no file
            extension or the download failed.
        """
        # Set once the target file has been opened for writing, so that a
        # failed download can remove the truncated file it left behind.
        partial_path: Optional[str] = None
        try:
            # Create the folder if it doesn't exist
            os.makedirs(folder_path, exist_ok=True)

            filename = os.path.basename(urlparse(image_url).path)
            if not os.path.splitext(filename)[1]:
                return None
            image_path = os.path.join(folder_path, filename)

            # The context manager releases the streamed connection even when
            # writing fails part-way through.
            with requests.get(
                    image_url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an exception for bad status codes
                with open(image_path, 'wb') as file:
                    partial_path = image_path
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            return filename
        except Exception as e:
            print(f"Error downloading image from {image_url}: {str(e)}")
            if partial_path is not None:
                try:
                    os.remove(partial_path)
                except OSError:
                    # Best-effort cleanup; the download failure is already
                    # reported above.
                    pass
            return None