RajatMalviya committed on
Commit
f7c4217
·
verified ·
1 Parent(s): d3cba69

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +47 -364
main.py CHANGED
@@ -1,152 +1,68 @@
1
  from pydantic import BaseModel
2
  from typing import List, Optional
3
- from linkdin_job_data import (
4
- JobCrawler,
5
- generate_pdf_resume,
6
- extract_text_from_pdf,
7
- classify_resume_with_gemini,
8
- ResumeData
9
- )
10
  import os
11
  from fastapi import FastAPI, File, UploadFile, HTTPException, Form
12
  from fastapi.responses import FileResponse
13
- from fastapi.middleware.cors import CORSMiddleware
14
  import requests
15
  import json
16
  import tempfile
17
- import mimetypes
18
- from pathlib import Path
19
 
20
 
21
- app = FastAPI(
22
- title="LinkedIn Data API",
23
- description="API for LinkedIn data extraction, job parsing, and resume generation",
24
- version="1.0.0"
25
- )
26
-
27
- # Add CORS middleware
28
- app.add_middleware(
29
- CORSMiddleware,
30
- allow_origins=["*"],
31
- allow_credentials=True,
32
- allow_methods=["*"],
33
- allow_headers=["*"],
34
- )
35
-
36
 
37
  def get_linkdin_data(url: str) -> dict:
38
- """
39
- Fetch LinkedIn data from BrightData API
40
-
41
- Args:
42
- url: LinkedIn profile URL
43
-
44
- Returns:
45
- dict: Response from BrightData API
46
-
47
- Raises:
48
- HTTPException: If API request fails
49
- """
50
- if not url or not url.strip():
51
- raise HTTPException(status_code=400, detail="URL cannot be empty")
52
-
53
- if "linkedin.com" not in url.lower():
54
- raise HTTPException(status_code=400, detail="Please provide a valid LinkedIn URL")
55
 
 
56
  headers = {
57
  "Authorization": "Bearer a6032564743ea10f33ac03ad330ab5299a0a06d15b606ff33bd91b40c6c2c098",
58
  "Content-Type": "application/json",
59
  }
60
 
61
  data = json.dumps({
62
- "input": [{"url": url}],
63
  })
64
 
65
- try:
66
- response = requests.post(
67
- "https://api.brightdata.com/datasets/v3/scrape?dataset_id=gd_l1viktl72bvl7bjuj0&notify=false&include_errors=true",
68
- headers=headers,
69
- data=data,
70
- timeout=30
71
- )
72
- response.raise_for_status()
73
- return response.json()
74
- except requests.exceptions.Timeout:
75
- raise HTTPException(status_code=504, detail="Request to BrightData API timed out")
76
- except requests.exceptions.RequestException as e:
77
- raise HTTPException(status_code=502, detail=f"Failed to fetch LinkedIn data: {str(e)}")
78
 
 
79
 
80
  @app.get("/")
81
  def read_root():
82
- """Root endpoint with API information"""
83
- return {
84
- "message": "Welcome to the LinkedIn Data API",
85
- "version": "1.0.0",
86
- "endpoints": {
87
- "user_data": "/user_data?url=<linkedin_url>",
88
- "job_extraction": "/job_extraction?url=<job_url>",
89
- "user_history": "/user_history (POST with JSON file)",
90
- "generate_resume": "/generate-resume (POST with JSON file)",
91
- "upload_resume": "/upload-resume/ (POST with PDF file)"
92
- }
93
- }
94
-
95
 
96
  @app.get("/user_data")
97
  def get_user_data(url: str):
98
- """
99
- Get LinkedIn user data from provided URL
100
-
101
- Args:
102
- url: LinkedIn profile URL
103
-
104
- Returns:
105
- dict: User data from LinkedIn
106
- """
107
  data = get_linkdin_data(url)
108
  return {"data": data}
109
 
110
 
111
  @app.get("/job_extraction")
112
  def job_extraction(url: str):
113
- """
114
- Extract job posting information from URL
115
-
116
- Args:
117
- url: Job posting URL
118
-
119
- Returns:
120
- dict: Extracted job data in markdown format
121
- """
122
- if not url or not url.strip():
123
- raise HTTPException(status_code=400, detail="URL cannot be empty")
124
-
125
- # Basic URL validation
126
- if not url.startswith(('http://', 'https://')):
127
- raise HTTPException(status_code=400, detail="Please provide a valid URL starting with http:// or https://")
128
-
129
  try:
130
  crawler = JobCrawler()
131
  job_data = crawler.crawl(url)
132
  markdown_output = crawler.to_markdown(job_data)
133
 
134
- # Log the output
135
  print("\n" + "="*80)
136
  print(markdown_output)
137
  print("="*80)
 
 
138
  print("\n\nRAW MARKDOWN:\n")
139
  print(job_data['raw_markdown'])
140
-
141
- return {
142
- "success": True,
143
- "job_data": job_data['raw_markdown'],
144
- "formatted_output": markdown_output
145
- }
146
 
147
  except Exception as e:
148
- print(f"\n❌ Error during job extraction: {str(e)}")
149
- raise HTTPException(status_code=500, detail=f"Failed to extract job data: {str(e)}")
 
 
150
 
151
 
152
  @app.post("/user_history")
@@ -165,77 +81,31 @@ async def user_history(
165
  Dictionary containing cleaned browser history
166
  """
167
 
168
- # Validate file is provided
169
- if not file.filename:
170
- raise HTTPException(status_code=400, detail="No file provided")
171
-
172
- # Validate file extension
173
- if not file.filename.lower().endswith('.json'):
174
- raise HTTPException(
175
- status_code=400,
176
- detail="Only JSON files are allowed. Please upload a .json file from Google Takeout"
177
- )
178
-
179
- # Validate max_entries
180
- if max_entries is not None and max_entries < 1:
181
- raise HTTPException(status_code=400, detail="max_entries must be at least 1")
182
-
183
- if max_entries is not None and max_entries > 10000:
184
- raise HTTPException(status_code=400, detail="max_entries cannot exceed 10000")
185
 
186
  try:
187
  # Read file content
188
  content = await file.read()
189
 
190
- # Check if file is empty
191
- if not content:
192
- raise HTTPException(status_code=400, detail="Uploaded file is empty")
193
-
194
  # Parse JSON
195
- try:
196
- data = json.loads(content.decode('utf-8'))
197
- except UnicodeDecodeError:
198
- raise HTTPException(status_code=400, detail="File encoding is not valid UTF-8")
199
- except json.JSONDecodeError as e:
200
- raise HTTPException(
201
- status_code=400,
202
- detail=f"Invalid JSON format: {str(e)}"
203
- )
204
-
205
- # Validate data structure
206
- if not isinstance(data, dict):
207
- raise HTTPException(
208
- status_code=400,
209
- detail="JSON file must contain an object/dictionary at root level"
210
- )
211
 
212
  # Extract browser history
213
  browser_history = data.get("Browser History", [])
214
 
215
  if not browser_history:
216
- raise HTTPException(
217
- status_code=400,
218
- detail="No 'Browser History' key found in JSON file. Please ensure you uploaded a valid Google Takeout history file"
219
- )
220
-
221
- if not isinstance(browser_history, list):
222
- raise HTTPException(
223
- status_code=400,
224
- detail="'Browser History' must be a list/array"
225
- )
226
 
227
  # Limit to max_entries or all available entries
228
- entries_to_process = min(len(browser_history), max_entries if max_entries else len(browser_history))
229
 
230
  # Process and clean entries
231
  history = []
232
  for i in range(entries_to_process):
233
  entry = browser_history[i]
234
 
235
- # Validate entry is a dictionary
236
- if not isinstance(entry, dict):
237
- continue
238
-
239
  cleaned_entry = {
240
  "title": entry.get("title", ""),
241
  "url": entry.get("url", ""),
@@ -250,233 +120,46 @@ async def user_history(
250
  "returned_entries": len(history),
251
  "history_data": history
252
  }
253
-
254
- except HTTPException:
255
- raise
256
  except Exception as e:
257
  raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
258
 
259
-
260
  @app.post("/generate-resume")
261
  async def generate_resume(file: UploadFile = File(...)):
262
- """
263
- Generate a PDF resume from JSON data
264
-
265
- Args:
266
- file: JSON file containing resume data
267
-
268
- Returns:
269
- FileResponse: Generated PDF resume
270
- """
271
- # Validate file is provided
272
- if not file.filename:
273
- raise HTTPException(status_code=400, detail="No file provided")
274
-
275
- # Validate file extension
276
- if not file.filename.lower().endswith('.json'):
277
- raise HTTPException(
278
- status_code=400,
279
- detail="Only JSON files are allowed. Please upload a .json file with resume data"
280
- )
281
-
282
- # Validate content type
283
- content_type = file.content_type
284
- if content_type and 'application/json' not in content_type.lower():
285
- raise HTTPException(
286
- status_code=400,
287
- detail=f"Invalid content type: {content_type}. Expected application/json"
288
- )
289
-
290
  try:
291
- # Read and parse JSON
292
- content = await file.read()
293
-
294
- if not content:
295
- raise HTTPException(status_code=400, detail="Uploaded file is empty")
296
-
297
- try:
298
- data = json.loads(content.decode('utf-8'))
299
- except UnicodeDecodeError:
300
- raise HTTPException(status_code=400, detail="File encoding is not valid UTF-8")
301
- except json.JSONDecodeError as e:
302
- raise HTTPException(
303
- status_code=400,
304
- detail=f"Invalid JSON format: {str(e)}"
305
- )
306
-
307
- # Validate data structure
308
- if not isinstance(data, dict):
309
- raise HTTPException(
310
- status_code=400,
311
- detail="JSON must contain an object/dictionary at root level"
312
- )
313
-
314
- # Check for required fields
315
- if 'name' not in data or not data['name']:
316
- raise HTTPException(
317
- status_code=400,
318
- detail="JSON must contain a 'name' field with a non-empty value"
319
- )
320
-
321
- # Create temporary PDF file
322
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
323
  output_path = tmp_pdf.name
324
-
325
- # Generate PDF
326
  generate_pdf_resume(data, output_path)
327
-
328
- # Validate PDF was created
329
- if not os.path.exists(output_path):
330
- raise HTTPException(status_code=500, detail="PDF file was not created")
331
-
332
- if os.path.getsize(output_path) == 0:
333
- os.remove(output_path)
334
- raise HTTPException(status_code=500, detail="Generated PDF is empty")
335
-
336
- # Return PDF file
337
- return FileResponse(
338
- output_path,
339
- media_type="application/pdf",
340
- filename="resume.pdf",
341
- headers={
342
- "Content-Disposition": "attachment; filename=resume.pdf"
343
- }
344
- )
345
-
346
- except HTTPException:
347
- raise
348
  except Exception as e:
349
- print(f"❌ Error generating resume: {str(e)}")
350
- # Clean up temporary file if it exists
351
- if 'output_path' in locals() and os.path.exists(output_path):
352
- try:
353
- os.remove(output_path)
354
- except:
355
- pass
356
- raise HTTPException(status_code=500, detail=f"Failed to generate resume: {str(e)}")
357
 
358
 
359
  @app.post("/upload-resume/", response_model=ResumeData)
360
  async def upload_resume(file: UploadFile = File(...)):
361
  """
362
  Upload a PDF resume, extract text, classify it via Gemini API, and return structured JSON.
363
-
364
- Args:
365
- file: PDF file containing resume
366
-
367
- Returns:
368
- ResumeData: Structured resume data
369
  """
370
- # Validate file is provided
371
- if not file.filename:
372
- raise HTTPException(status_code=400, detail="No file provided")
373
-
374
- # Validate file extension
375
- if not file.filename.lower().endswith('.pdf'):
376
- raise HTTPException(
377
- status_code=400,
378
- detail="Only PDF files are supported. Please upload a .pdf file"
379
- )
380
-
381
- # Validate content type
382
- content_type = file.content_type
383
- if content_type and 'application/pdf' not in content_type.lower():
384
- raise HTTPException(
385
- status_code=400,
386
- detail=f"Invalid content type: {content_type}. Expected application/pdf"
387
- )
388
-
389
- tmp_path = None
390
-
391
- try:
392
- # Read file content
393
- content = await file.read()
394
-
395
- # Check if file is empty
396
- if not content:
397
- raise HTTPException(status_code=400, detail="Uploaded PDF file is empty")
398
-
399
- # Validate PDF magic number (first 4 bytes should be %PDF)
400
- if not content.startswith(b'%PDF'):
401
- raise HTTPException(
402
- status_code=400,
403
- detail="File does not appear to be a valid PDF. File header is incorrect"
404
- )
405
-
406
- # Create temporary file
407
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
408
- tmp.write(content)
409
- tmp_path = tmp.name
410
-
411
- # Extract text from PDF
412
- try:
413
- text = extract_text_from_pdf(tmp_path)
414
- except Exception as e:
415
- raise HTTPException(
416
- status_code=400,
417
- detail=f"Failed to extract text from PDF: {str(e)}"
418
- )
419
-
420
- # Validate extracted text
421
- if not text or not text.strip():
422
- raise HTTPException(
423
- status_code=400,
424
- detail="No readable text found in the PDF. Please ensure the PDF contains selectable text (not just images)"
425
- )
426
-
427
- # Check minimum text length
428
- if len(text.strip()) < 50:
429
- raise HTTPException(
430
- status_code=400,
431
- detail="Insufficient text content in PDF. Please upload a complete resume"
432
- )
433
-
434
- # Classify resume using Gemini
435
- try:
436
- parsed_data = classify_resume_with_gemini(text)
437
- except HTTPException:
438
- raise
439
- except Exception as e:
440
- raise HTTPException(
441
- status_code=500,
442
- detail=f"Failed to parse resume content: {str(e)}"
443
- )
444
-
445
- # Validate and return structured data
446
- try:
447
- validated = ResumeData(**parsed_data)
448
- return validated
449
- except Exception as e:
450
- raise HTTPException(
451
- status_code=500,
452
- detail=f"Failed to validate parsed resume data: {str(e)}"
453
- )
454
-
455
- except HTTPException:
456
- raise
457
- except Exception as e:
458
- raise HTTPException(status_code=500, detail=f"Unexpected error processing resume: {str(e)}")
459
-
460
- finally:
461
- # Clean up temporary file
462
- if tmp_path and os.path.exists(tmp_path):
463
- try:
464
- os.remove(tmp_path)
465
- except Exception as e:
466
- print(f"⚠️ Failed to remove temporary file {tmp_path}: {str(e)}")
467
 
 
 
 
468
 
469
- # Health check endpoint
470
- @app.get("/health")
471
- def health_check():
472
- """Health check endpoint"""
473
- return {
474
- "status": "healthy",
475
- "service": "LinkedIn Data API",
476
- "version": "1.0.0"
477
- }
478
 
 
 
 
479
 
480
- if __name__ == "__main__":
481
- import uvicorn
482
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
  from pydantic import BaseModel
2
  from typing import List, Optional
3
+ from linkdin_job_data import JobCrawler, generate_pdf_resume, extract_text_from_pdf, classify_resume_with_gemini, ResumeData
 
 
 
 
 
 
4
  import os
5
  from fastapi import FastAPI, File, UploadFile, HTTPException, Form
6
  from fastapi.responses import FileResponse
 
7
  import requests
8
  import json
9
  import tempfile
 
 
10
 
11
 
12
+ app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def get_linkdin_data(url: str) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+
17
  headers = {
18
  "Authorization": "Bearer a6032564743ea10f33ac03ad330ab5299a0a06d15b606ff33bd91b40c6c2c098",
19
  "Content-Type": "application/json",
20
  }
21
 
22
  data = json.dumps({
23
+ "input": [{"url":url}],
24
  })
25
 
26
+ response = requests.post(
27
+ "https://api.brightdata.com/datasets/v3/scrape?dataset_id=gd_l1viktl72bvl7bjuj0&notify=false&include_errors=true",
28
+ headers=headers,
29
+ data=data
30
+ )
 
 
 
 
 
 
 
 
31
 
32
+ return response.json()
33
 
34
  @app.get("/")
35
  def read_root():
36
+ return {"message": "Welcome to the LinkedIn Data API"}
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  @app.get("/user_data")
39
  def get_user_data(url: str):
 
 
 
 
 
 
 
 
 
40
  data = get_linkdin_data(url)
41
  return {"data": data}
42
 
43
 
44
  @app.get("/job_extraction")
45
  def job_extraction(url: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  try:
47
  crawler = JobCrawler()
48
  job_data = crawler.crawl(url)
49
  markdown_output = crawler.to_markdown(job_data)
50
 
51
+ # Return/print the markdown text
52
  print("\n" + "="*80)
53
  print(markdown_output)
54
  print("="*80)
55
+
56
+ # Optionally return raw markdown too
57
  print("\n\nRAW MARKDOWN:\n")
58
  print(job_data['raw_markdown'])
59
+ return {"job_data":job_data['raw_markdown']}
 
 
 
 
 
60
 
61
  except Exception as e:
62
+ print(f"Error: {str(e)}")
63
+ return {"error": str(e)}
64
+ except Exception as e:
65
+ print(f"\n❌ Error: {str(e)}")
66
 
67
 
68
  @app.post("/user_history")
 
81
  Dictionary containing cleaned browser history
82
  """
83
 
84
+ # Validate file type
85
+ if not file.filename.endswith('.json'):
86
+ raise HTTPException(status_code=400, detail="Only JSON files are allowed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  try:
89
  # Read file content
90
  content = await file.read()
91
 
 
 
 
 
92
  # Parse JSON
93
+ data = json.loads(content.decode('utf-8'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  # Extract browser history
96
  browser_history = data.get("Browser History", [])
97
 
98
  if not browser_history:
99
+ raise HTTPException(status_code=400, detail="No 'Browser History' found in JSON file")
 
 
 
 
 
 
 
 
 
100
 
101
  # Limit to max_entries or all available entries
102
+ entries_to_process = min(len(browser_history), max_entries)
103
 
104
  # Process and clean entries
105
  history = []
106
  for i in range(entries_to_process):
107
  entry = browser_history[i]
108
 
 
 
 
 
109
  cleaned_entry = {
110
  "title": entry.get("title", ""),
111
  "url": entry.get("url", ""),
 
120
  "returned_entries": len(history),
121
  "history_data": history
122
  }
123
+ except json.JSONDecodeError:
124
+ raise HTTPException(status_code=400, detail="Invalid JSON format")
 
125
  except Exception as e:
126
  raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
127
 
 
128
  @app.post("/generate-resume")
129
  async def generate_resume(file: UploadFile = File(...)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  try:
131
+ data = json.load(file.file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
133
  output_path = tmp_pdf.name
 
 
134
  generate_pdf_resume(data, output_path)
135
+ if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
136
+ raise HTTPException(status_code=500, detail="PDF generation failed")
137
+ return FileResponse(output_path, media_type="application/pdf", filename="resume.pdf")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  except Exception as e:
139
+ print("❌ Error:", e)
140
+ raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
141
 
142
 
143
  @app.post("/upload-resume/", response_model=ResumeData)
144
  async def upload_resume(file: UploadFile = File(...)):
145
  """
146
  Upload a PDF resume, extract text, classify it via Gemini API, and return structured JSON.
 
 
 
 
 
 
147
  """
148
+ if not file.filename.lower().endswith(".pdf"):
149
+ raise HTTPException(status_code=400, detail="Only PDF files are supported.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
152
+ tmp.write(await file.read())
153
+ tmp_path = tmp.name
154
 
155
+ try:
156
+ text = extract_text_from_pdf(tmp_path)
157
+ if not text:
158
+ raise HTTPException(status_code=400, detail="No readable text found in the PDF.")
 
 
 
 
 
159
 
160
+ parsed_data = classify_resume_with_gemini(text)
161
+ validated = ResumeData(**parsed_data)
162
+ return validated
163
 
164
+ finally:
165
+ os.remove(tmp_path)