Baskar2005 committed on
Commit
4cbd83c
·
verified ·
1 Parent(s): e0d8c79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -37
app.py CHANGED
@@ -9,8 +9,7 @@ app = Flask(__name__)
9
 
10
  # ---------------- CONFIGURATION ---------------- #
11
  SESSION_FILE = "instagram_session.json"
12
- HEADLESS_MODE = False # Keep False for stability
13
- MAX_WORKERS = 3 # Reduced to 3 to prevent lagging your PC
14
  # ----------------------------------------------- #
15
 
16
  def identify_url_type(url):
@@ -21,41 +20,46 @@ def identify_url_type(url):
21
  if "instagram.com/" in url: return "PROFILE"
22
  return "UNKNOWN"
23
 
24
- # --- HELPER: RECURSIVE SEARCH ( The "Main.py" Logic ) ---
25
  def find_username_in_json(obj):
26
  if isinstance(obj, dict):
27
- # Priority 1: Check inside 'owner' object
28
  if "owner" in obj and isinstance(obj["owner"], dict):
29
- if "username" in obj["owner"]:
30
- return obj["owner"]["username"]
31
-
32
- # Priority 2: Check standard user object
33
- if "username" in obj and "is_verified" in obj:
34
- return obj["username"]
35
-
36
- # Recursive Loop
37
  for k, v in obj.items():
38
  if isinstance(v, (dict, list)):
39
  res = find_username_in_json(v)
40
  if res: return res
41
-
42
  elif isinstance(obj, list):
43
  for item in obj:
44
  res = find_username_in_json(item)
45
  if res: return res
46
  return None
47
 
48
- # --- WORKER FUNCTION ---
49
  def scrape_single_url(url):
50
  if not url or not url.strip(): return None
51
 
52
- # New Browser Instance for Thread Safety
53
  with sync_playwright() as p:
54
- browser = p.chromium.launch(headless=HEADLESS_MODE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  if os.path.exists(SESSION_FILE):
56
- context = browser.new_context(storage_state=SESSION_FILE)
57
  else:
58
- context = browser.new_context()
59
 
60
  page = context.new_page()
61
  print(f"⚡ Processing: {url}")
@@ -70,14 +74,12 @@ def scrape_single_url(url):
70
  "status": "Starting"
71
  }
72
 
73
- # Skip System Links
74
  if data["type"] in ["SYSTEM", "UNKNOWN"]:
75
  data["status"] = "Skipped"
76
  browser.close()
77
  return data
78
 
79
  try:
80
- # === PATH A: PROFILE ===
81
  if data["type"] == "PROFILE":
82
  page.goto(url, wait_until="domcontentloaded", timeout=60000)
83
  time.sleep(3)
@@ -93,14 +95,12 @@ def scrape_single_url(url):
93
  data["author"] = url.strip("/").split("/")[-1]
94
  data["status"] = "Success"
95
 
96
- # === PATH B: MEDIA (REEL/POST) ===
97
  elif data["type"] in ["REEL", "POST"]:
98
  if "/reel/" in url:
99
  shortcode = url.split("/reel/")[1].split("/")[0]
100
  else:
101
  shortcode = url.split("/p/")[1].split("/")[0]
102
 
103
- # 1. NETWORK LISTENER (Restored Robust Logic)
104
  captured_info = {"username": None}
105
 
106
  def handle_response(response):
@@ -117,7 +117,6 @@ def scrape_single_url(url):
117
  time.sleep(4)
118
  page.remove_listener("response", handle_response)
119
 
120
- # 2. GET LIKES (Meta Tag)
121
  try:
122
  meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
123
  if meta_desc:
@@ -125,27 +124,21 @@ def scrape_single_url(url):
125
  if likes_match: data["likes"] = likes_match.group(1)
126
  except: pass
127
 
128
- # 3. GET AUTHOR (Network > Title > Pattern)
129
  if captured_info["username"]:
130
  data["author"] = captured_info["username"]
131
 
132
- # Fallback: Title Tag
133
  if not data["author"]:
134
  try:
135
  title = page.title()
136
- # Matches "Username (@handle) on Instagram"
137
  match = re.search(r'\(@(.*?)\)', title)
138
- if match:
139
- data["author"] = match.group(1)
140
  else:
141
- # Matches "Username on Instagram" (Start of title)
142
  match_b = re.search(r'^(.*?)\son\sInstagram', title)
143
  if match_b:
144
  parts = match_b.group(1).split(" ")
145
  if len(parts) == 1: data["author"] = parts[0]
146
  except: pass
147
 
148
- # Fallback: Link Pattern
149
  if not data["author"]:
150
  try:
151
  links = page.locator("a[href*='/reels/']").all()
@@ -160,10 +153,8 @@ def scrape_single_url(url):
160
  break
161
  except: pass
162
 
163
- # 4. GET VIEWS (Hop to Profile)
164
  if data["author"]:
165
  is_video = False
166
- # Check if Reel or Video Post
167
  try:
168
  og_type = page.locator('meta[property="og:type"]').get_attribute("content")
169
  if og_type and "video" in og_type: is_video = True
@@ -171,7 +162,6 @@ def scrape_single_url(url):
171
  if data["type"] == "REEL": is_video = True
172
 
173
  if is_video:
174
- # Hop to Reels Tab
175
  profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
176
  page.goto(profile_reels_url, wait_until="domcontentloaded")
177
  time.sleep(3)
@@ -193,7 +183,6 @@ def scrape_single_url(url):
193
  else:
194
  data["views"] = "N/A (Photo)"
195
 
196
- # Followers (Bonus)
197
  try:
198
  fol_link = page.locator("a[href*='/followers/']").first
199
  if fol_link.count() > 0:
@@ -207,13 +196,12 @@ def scrape_single_url(url):
207
  data["status"] = "Failed (No Author)"
208
 
209
  except Exception as e:
210
- data["status"] = f"Error"
211
  print(f"❌ Error: {e}")
212
 
213
  browser.close()
214
  return data
215
 
216
- # --- ROUTES ---
217
  @app.route('/')
218
  def home():
219
  return render_template('index.html')
@@ -249,4 +237,5 @@ def scrape_api():
249
  return jsonify(results)
250
 
251
  if __name__ == '__main__':
252
- app.run(debug=True, host='0.0.0.0',port=7860, use_reloader=False)
 
 
9
 
10
  # ---------------- CONFIGURATION ---------------- #
11
  SESSION_FILE = "instagram_session.json"
12
+ MAX_WORKERS = 3
 
13
  # ----------------------------------------------- #
14
 
15
  def identify_url_type(url):
 
20
  if "instagram.com/" in url: return "PROFILE"
21
  return "UNKNOWN"
22
 
23
+ # --- HELPER: RECURSIVE SEARCH ---
24
  def find_username_in_json(obj):
25
  if isinstance(obj, dict):
 
26
  if "owner" in obj and isinstance(obj["owner"], dict):
27
+ if "username" in obj["owner"]: return obj["owner"]["username"]
28
+ if "username" in obj and "is_verified" in obj: return obj["username"]
 
 
 
 
 
 
29
  for k, v in obj.items():
30
  if isinstance(v, (dict, list)):
31
  res = find_username_in_json(v)
32
  if res: return res
 
33
  elif isinstance(obj, list):
34
  for item in obj:
35
  res = find_username_in_json(item)
36
  if res: return res
37
  return None
38
 
 
39
  def scrape_single_url(url):
40
  if not url or not url.strip(): return None
41
 
 
42
  with sync_playwright() as p:
43
+ # 🔥 STEALTH CONFIGURATION 🔥
44
+ # 1. Hide the "Automation" flag
45
+ # 2. Force Headless=True (Required for Server Stability)
46
+ browser = p.chromium.launch(
47
+ headless=True,
48
+ args=["--disable-blink-features=AutomationControlled"]
49
+ )
50
+
51
+ # 3. Inject "Real Human" User-Agent (Windows Chrome)
52
+ # This prevents the "N/A" error by tricking Instagram
53
+ context_args = {
54
+ "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
55
+ "viewport": {"width": 1280, "height": 720},
56
+ "locale": "en-US"
57
+ }
58
+
59
  if os.path.exists(SESSION_FILE):
60
+ context = browser.new_context(storage_state=SESSION_FILE, **context_args)
61
  else:
62
+ context = browser.new_context(**context_args)
63
 
64
  page = context.new_page()
65
  print(f"⚡ Processing: {url}")
 
74
  "status": "Starting"
75
  }
76
 
 
77
  if data["type"] in ["SYSTEM", "UNKNOWN"]:
78
  data["status"] = "Skipped"
79
  browser.close()
80
  return data
81
 
82
  try:
 
83
  if data["type"] == "PROFILE":
84
  page.goto(url, wait_until="domcontentloaded", timeout=60000)
85
  time.sleep(3)
 
95
  data["author"] = url.strip("/").split("/")[-1]
96
  data["status"] = "Success"
97
 
 
98
  elif data["type"] in ["REEL", "POST"]:
99
  if "/reel/" in url:
100
  shortcode = url.split("/reel/")[1].split("/")[0]
101
  else:
102
  shortcode = url.split("/p/")[1].split("/")[0]
103
 
 
104
  captured_info = {"username": None}
105
 
106
  def handle_response(response):
 
117
  time.sleep(4)
118
  page.remove_listener("response", handle_response)
119
 
 
120
  try:
121
  meta_desc = page.locator('meta[property="og:description"]').get_attribute("content")
122
  if meta_desc:
 
124
  if likes_match: data["likes"] = likes_match.group(1)
125
  except: pass
126
 
 
127
  if captured_info["username"]:
128
  data["author"] = captured_info["username"]
129
 
 
130
  if not data["author"]:
131
  try:
132
  title = page.title()
 
133
  match = re.search(r'\(@(.*?)\)', title)
134
+ if match: data["author"] = match.group(1)
 
135
  else:
 
136
  match_b = re.search(r'^(.*?)\son\sInstagram', title)
137
  if match_b:
138
  parts = match_b.group(1).split(" ")
139
  if len(parts) == 1: data["author"] = parts[0]
140
  except: pass
141
 
 
142
  if not data["author"]:
143
  try:
144
  links = page.locator("a[href*='/reels/']").all()
 
153
  break
154
  except: pass
155
 
 
156
  if data["author"]:
157
  is_video = False
 
158
  try:
159
  og_type = page.locator('meta[property="og:type"]').get_attribute("content")
160
  if og_type and "video" in og_type: is_video = True
 
162
  if data["type"] == "REEL": is_video = True
163
 
164
  if is_video:
 
165
  profile_reels_url = f"https://www.instagram.com/{data['author']}/reels/"
166
  page.goto(profile_reels_url, wait_until="domcontentloaded")
167
  time.sleep(3)
 
183
  else:
184
  data["views"] = "N/A (Photo)"
185
 
 
186
  try:
187
  fol_link = page.locator("a[href*='/followers/']").first
188
  if fol_link.count() > 0:
 
196
  data["status"] = "Failed (No Author)"
197
 
198
  except Exception as e:
199
+ data["status"] = "Error"
200
  print(f"❌ Error: {e}")
201
 
202
  browser.close()
203
  return data
204
 
 
205
  @app.route('/')
206
  def home():
207
  return render_template('index.html')
 
237
  return jsonify(results)
238
 
239
  if __name__ == '__main__':
240
+ # HUGGING FACE REQUIRES PORT 7860
241
+ app.run(host='0.0.0.0', port=7860)