# File size: 15,032 Bytes
# 5c3dc0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
import streamlit as st
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse

class InstagramScraper:
    """Best-effort scraper for Instagram profile and post pages.

    Pages are fetched through a shared ``requests.Session`` configured with
    browser-like headers. The raw page text is then searched with regular
    expressions for image URLs, post shortcodes and embedded profile JSON.

    The fetching methods never raise: failures are reported in the
    ``"errors"`` list of the returned dict. The pure parsing helpers report
    unexpected failures through Streamlit (``st.error`` / ``st.warning``) and
    return whatever they collected so far.
    """

    # Image URL regexes applied, in order, to profile pages and to the page
    # fetched by extract_post_data().
    _PAGE_IMAGE_PATTERNS = tuple(re.compile(p) for p in (
        # Instagram post images (high quality)
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        r'"src":"([^"]*\.jpg[^"]*)"',
        r'"src":"([^"]*\.jpeg[^"]*)"',
        r'"src":"([^"]*\.png[^"]*)"',
        # Direct image URLs
        r'https://[^"]*\.jpg[^"]*',
        r'https://[^"]*\.jpeg[^"]*',
        r'https://[^"]*\.png[^"]*',
        # Instagram CDN URLs (high quality)
        r'https://scontent[^"]*\.jpg[^"]*',
        r'https://scontent[^"]*\.jpeg[^"]*',
        r'https://scontent[^"]*\.png[^"]*',
        # Additional Instagram patterns
        r'"url":"([^"]*\.jpg[^"]*)"',
        r'"url":"([^"]*\.jpeg[^"]*)"',
        r'"url":"([^"]*\.png[^"]*)"',
    ))

    # Narrower pattern list used by extract_post_images().
    _POST_IMAGE_PATTERNS = tuple(re.compile(p) for p in (
        # Instagram post images (high quality)
        r'"display_url":"([^"]+)"',
        r'"display_src":"([^"]+)"',
        # Instagram CDN URLs (highest quality)
        r'https://scontent[^"]*\.jpg[^"]*',
        r'https://scontent[^"]*\.jpeg[^"]*',
        r'https://scontent[^"]*\.png[^"]*',
        # Additional patterns
        r'"src":"([^"]*\.jpg[^"]*)"',
        r'"src":"([^"]*\.jpeg[^"]*)"',
        r'"src":"([^"]*\.png[^"]*)"',
    ))

    # Regexes whose single group captures a post shortcode.
    _POST_CODE_PATTERNS = tuple(re.compile(p) for p in (
        r'"shortcode":"([^"]+)"',
        r'/p/([^/"]+)',
        r'/reel/([^/"]+)',
    ))

    # Each anchor ends right before the opening brace of a JSON object, so the
    # object itself can be decoded with JSONDecoder.raw_decode(). A regex such
    # as ``{[^}]+}`` cannot be used for this: it stops at the first ``}`` and
    # therefore truncates any object that contains nested objects.
    _PROFILE_JSON_ANCHORS = tuple(re.compile(p) for p in (
        r'window\._sharedData\s*=\s*(?=\{)',
        r'"profile_page":\s*(?=\{)',
        r'"user":\s*(?=\{)',
    ))

    def __init__(self):
        """Create the HTTP session and install browser-like request headers."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _collect_image_urls(page_text, patterns):
        """Return the unique, cleaned image URLs matched by ``patterns``.

        Only matches containing ``instagram`` or ``scontent`` (case-insensitive)
        are kept. JSON escapes ``\\u0026`` and ``\\/`` are unescaped. URLs are
        returned in first-seen order (patterns in order, matches in page
        order) so the result is deterministic across runs.
        """
        unique = {}  # dict used as an insertion-ordered set
        for pattern in patterns:
            for match in pattern.findall(page_text):
                lowered = match.lower()
                if 'instagram' in lowered or 'scontent' in lowered:
                    cleaned = match.replace('\\u0026', '&').replace('\\/', '/')
                    unique.setdefault(cleaned, None)
        return list(unique)

    @staticmethod
    def _build_image_records(urls):
        """Wrap each URL in the image dict shape used throughout the scraper."""
        return [
            {
                "src": img_url,
                "alt": f"Instagram post image {number}",
                "title": f"Instagram post image {number}",
                "width": "",
                "height": ""
            }
            for number, img_url in enumerate(urls, start=1)
        ]

    @classmethod
    def _find_user_json(cls, page_text):
        """Return the first embedded JSON user object with a username, or None.

        The decoded object is accepted either when it wraps the user under a
        ``"user"`` key or when it is the user object itself.
        """
        decoder = json.JSONDecoder()
        for anchor in cls._PROFILE_JSON_ANCHORS:
            for found in anchor.finditer(page_text):
                try:
                    data, _ = decoder.raw_decode(page_text, found.end())
                except ValueError:
                    # Not valid JSON at this position; try the next occurrence.
                    continue
                if not isinstance(data, dict):
                    continue
                nested = data.get("user")
                candidate = nested if isinstance(nested, dict) else data
                if candidate.get("username"):
                    return candidate
        return None

    @staticmethod
    def _nested_count(user_data, key):
        """Return ``user_data[key]["count"]`` or ``""`` when it is unavailable."""
        value = user_data.get(key)
        return value.get("count", "") if isinstance(value, dict) else ""

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_instagram_data(self, url):
        """Extract data from an Instagram profile or post URL.

        URLs containing ``/p/`` or ``/reel/`` are handled by
        extract_post_data(); everything else by extract_profile_data().

        Args:
            url: Address of the Instagram page to scrape.

        Returns:
            A dict with the keys ``url``, ``timestamp``, ``platform``,
            ``images``, ``posts``, ``profile_info`` and ``errors``, plus
            whatever keys the post/profile extractor contributed. ``errors``
            additionally receives a list of hints when no images, posts or
            username were found. This method does not raise.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "platform": "instagram",
            "images": [],
            "posts": [],
            "profile_info": {},
            "errors": []
        }

        try:
            # Determine if it's a profile or post URL
            if "/p/" in url or "/reel/" in url:
                extracted = self.extract_post_data(url)
            else:
                extracted = self.extract_profile_data(url)

            # Merge the extractor's errors instead of letting update()
            # replace the list outright.
            sub_errors = extracted.pop("errors", [])
            scraped_data.update(extracted)
            scraped_data["errors"].extend(sub_errors)

        except Exception as e:
            scraped_data["errors"].append(f"Instagram scraping error: {str(e)}")

        # Check if we found any data
        found_anything = (
            scraped_data.get("images")
            or scraped_data.get("posts")
            or scraped_data.get("profile_info", {}).get("username")
        )
        if not found_anything:
            scraped_data["errors"].extend([
                "No Instagram data found. This might be due to:",
                "- Private or protected account",
                "- Instagram's anti-scraping measures",
                "- Network connectivity issues",
                "- URL format issues",
            ])

        return scraped_data

    def extract_post_data(self, url):
        """Fetch a single post page and extract its image URLs.

        Args:
            url: Address of the post or reel page.

        Returns:
            A dict with ``post_type``, ``images`` and ``post_info``. If the
            request or the parsing fails, an ``errors`` list with one message
            is added and the remaining fields keep their initial empty values.
        """
        post_data = {
            "post_type": "single_post",
            "images": [],
            "post_info": {}
        }

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            # Instagram loads images dynamically, so the URLs are searched for
            # in the raw page source rather than in parsed <img> tags.
            urls = self._collect_image_urls(response.text, self._PAGE_IMAGE_PATTERNS)
            post_data["images"] = self._build_image_records(urls)

            post_data["post_info"] = {
                "url": url,
                "images_count": len(post_data["images"]),
                "scraped_at": datetime.now().isoformat()
            }

        except Exception as e:
            post_data["errors"] = [f"Failed to extract post data: {str(e)}"]

        return post_data

    def extract_profile_data(self, url):
        """Fetch a profile page and extract profile info, posts and images.

        Images found on the profile page come first, followed by images
        fetched from up to three of the discovered posts.

        Args:
            url: Address of the profile page.

        Returns:
            A dict with ``profile_type``, ``images``, ``profile_info`` and
            ``posts``. If fetching or parsing fails, an ``errors`` list with
            one message is added.
        """
        profile_data = {
            "profile_type": "account",
            "images": [],
            "profile_info": {},
            "posts": []
        }

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            page_text = response.text

            profile_data["profile_info"] = self.extract_profile_info(soup, page_text)

            # Posts are extracted before images because their URLs drive the
            # per-post image fetch below.
            profile_data["posts"] = self.extract_recent_posts(page_text)
            profile_data["images"] = self.extract_profile_images(page_text)

            # Extract images from individual posts (higher quality)
            if profile_data["posts"]:
                post_images = self.extract_images_from_posts(profile_data["posts"], max_posts=3)
                if post_images:
                    profile_data["images"].extend(post_images)

        except Exception as e:
            profile_data["errors"] = [f"Failed to extract profile data: {str(e)}"]

        return profile_data

    def extract_profile_info(self, soup, page_text):
        """Extract profile information from a profile page.

        Embedded JSON is tried first. When it yields no username, the text in
        parentheses in the page ``<title>`` is used as the username.

        Args:
            soup: Parsed page; only consulted (via ``soup.find('title')``)
                when the JSON lookup yields no username.
            page_text: Raw page source searched for embedded JSON.

        Returns:
            A dict with ``username``, ``display_name``, ``bio``, ``followers``,
            ``following`` and ``posts_count`` (empty strings when unknown).
            An ``error`` key is added if extraction raised.
        """
        profile_info = {
            "username": "",
            "display_name": "",
            "bio": "",
            "followers": "",
            "following": "",
            "posts_count": ""
        }

        try:
            user_data = self._find_user_json(page_text)
            if user_data is not None:
                profile_info["username"] = user_data.get("username", "")
                profile_info["display_name"] = user_data.get("full_name", "")
                profile_info["bio"] = user_data.get("biography", "")
                profile_info["followers"] = self._nested_count(user_data, "followed_by")
                profile_info["following"] = self._nested_count(user_data, "follows")
                profile_info["posts_count"] = self._nested_count(user_data, "media")

            # Fallback: try to extract from HTML
            if not profile_info["username"]:
                title_tag = soup.find('title')
                if title_tag:
                    title_text = title_tag.get_text()
                    if '(' in title_text and ')' in title_text:
                        username = title_text.split('(')[1].split(')')[0]
                        profile_info["username"] = username

        except Exception as e:
            profile_info["error"] = f"Failed to extract profile info: {str(e)}"

        return profile_info

    def extract_profile_images(self, page_text):
        """Extract image records from the raw source of a profile page.

        Args:
            page_text: Raw page source.

        Returns:
            A list of image dicts (``src``, ``alt``, ``title``, ``width``,
            ``height``) in first-seen order; empty on failure, which is
            reported via ``st.error``.
        """
        try:
            urls = self._collect_image_urls(page_text, self._PAGE_IMAGE_PATTERNS)
            return self._build_image_records(urls)
        except Exception as e:
            st.error(f"Failed to extract profile images: {str(e)}")
            return []

    def extract_recent_posts(self, page_text):
        """Extract up to ten post references from the raw source of a profile.

        Shortcodes are de-duplicated in first-seen order, so the ten that are
        kept are always the same for the same page text.

        Args:
            page_text: Raw page source.

        Returns:
            A list of dicts with ``shortcode``, ``url`` (always built with
            the ``/p/`` path) and a 1-based ``index``. Failures are reported
            via ``st.error``.
        """
        posts = []

        try:
            found_posts = {}  # dict used as an insertion-ordered set
            for pattern in self._POST_CODE_PATTERNS:
                for match in pattern.findall(page_text):
                    if match:
                        found_posts.setdefault(match, None)

            # Limit to 10 posts
            for i, post_code in enumerate(list(found_posts)[:10]):
                posts.append({
                    "shortcode": post_code,
                    "url": f"https://www.instagram.com/p/{post_code}/",
                    "index": i + 1
                })

        except Exception as e:
            st.error(f"Failed to extract recent posts: {str(e)}")

        return posts

    def extract_images_from_posts(self, posts, max_posts=5):
        """Fetch individual post pages and collect their images.

        Args:
            posts: Post dicts as produced by extract_recent_posts(); each
                needs ``url`` and ``shortcode``.
            max_posts: Maximum number of posts (from the front of ``posts``)
                to fetch.

        Returns:
            A list of image dicts, each extended with ``post_url`` and a
            1-based ``post_index``. A post that fails is reported via
            ``st.warning`` and skipped.
        """
        all_images = []

        try:
            for i, post in enumerate(posts[:max_posts]):
                try:
                    post_url = post["url"]
                    response = self.session.get(post_url, timeout=10)
                    response.raise_for_status()

                    post_images = self.extract_post_images(response.text)

                    # Add post context to images
                    for img in post_images:
                        img["post_url"] = post_url
                        img["post_index"] = i + 1
                        all_images.append(img)

                    # Small delay to be respectful
                    time.sleep(1)

                except Exception as e:
                    st.warning(f"Failed to extract images from post {post['shortcode']}: {str(e)}")
                    continue

        except Exception as e:
            st.error(f"Failed to extract images from posts: {str(e)}")

        return all_images

    def extract_post_images(self, page_text):
        """Extract image records from the raw source of a single post page.

        Args:
            page_text: Raw page source.

        Returns:
            A list of image dicts (``src``, ``alt``, ``title``, ``width``,
            ``height``) in first-seen order; empty on failure, which is
            reported via ``st.error``.
        """
        try:
            urls = self._collect_image_urls(page_text, self._POST_IMAGE_PATTERNS)
            return self._build_image_records(urls)
        except Exception as e:
            st.error(f"Failed to extract post images: {str(e)}")
            return []

# Global Instagram scraper instance, created once when this module is imported.
# The constructor opens a requests.Session, so every user of this object goes
# through the same HTTP session and headers.
instagram_scraper = InstagramScraper()