Spaces:

RajatMalviya
/

edutech

Sleeping

App Files Files Community

RajatMalviya committed on Nov 2, 2025

Commit

f7c4217

verified ·

1 Parent(s): d3cba69

Update main.py

Browse files

Files changed (1) hide show

main.py +47 -364

main.py CHANGED Viewed

@@ -1,152 +1,68 @@
 from pydantic import BaseModel
 from typing import List, Optional
-from linkdin_job_data import (
-    JobCrawler,
-    generate_pdf_resume,
-    extract_text_from_pdf,
-    classify_resume_with_gemini,
-    ResumeData
-)
 import os
 from fastapi import FastAPI, File, UploadFile, HTTPException, Form
 from fastapi.responses import FileResponse
-from fastapi.middleware.cors import CORSMiddleware
 import requests
 import json
 import tempfile
-import mimetypes
-from pathlib import Path
-app = FastAPI(
-    title="LinkedIn Data API",
-    description="API for LinkedIn data extraction, job parsing, and resume generation",
-    version="1.0.0"
-)
-# Add CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
 def get_linkdin_data(url: str) -> dict:
-    """
-    Fetch LinkedIn data from BrightData API
-    Args:
-        url: LinkedIn profile URL
-    Returns:
-        dict: Response from BrightData API
-    Raises:
-        HTTPException: If API request fails
-    """
-    if not url or not url.strip():
-        raise HTTPException(status_code=400, detail="URL cannot be empty")
-    if "linkedin.com" not in url.lower():
-        raise HTTPException(status_code=400, detail="Please provide a valid LinkedIn URL")
     headers = {
         "Authorization": "Bearer a6032564743ea10f33ac03ad330ab5299a0a06d15b606ff33bd91b40c6c2c098",
         "Content-Type": "application/json",
     }
     data = json.dumps({
-        "input": [{"url": url}],
     })
-    try:
-        response = requests.post(
-            "https://api.brightdata.com/datasets/v3/scrape?dataset_id=gd_l1viktl72bvl7bjuj0&notify=false&include_errors=true",
-            headers=headers,
-            data=data,
-            timeout=30
-        )
-        response.raise_for_status()
-        return response.json()
-    except requests.exceptions.Timeout:
-        raise HTTPException(status_code=504, detail="Request to BrightData API timed out")
-    except requests.exceptions.RequestException as e:
-        raise HTTPException(status_code=502, detail=f"Failed to fetch LinkedIn data: {str(e)}")
 @app.get("/")
 def read_root():
-    """Root endpoint with API information"""
-    return {
-        "message": "Welcome to the LinkedIn Data API",
-        "version": "1.0.0",
-        "endpoints": {
-            "user_data": "/user_data?url=<linkedin_url>",
-            "job_extraction": "/job_extraction?url=<job_url>",
-            "user_history": "/user_history (POST with JSON file)",
-            "generate_resume": "/generate-resume (POST with JSON file)",
-            "upload_resume": "/upload-resume/ (POST with PDF file)"
-        }
-    }
 @app.get("/user_data")
 def get_user_data(url: str):
-    """
-    Get LinkedIn user data from provided URL
-    Args:
-        url: LinkedIn profile URL
-    Returns:
-        dict: User data from LinkedIn
-    """
     data = get_linkdin_data(url)
     return {"data": data}
 @app.get("/job_extraction")
 def job_extraction(url: str):
-    """
-    Extract job posting information from URL
-    Args:
-        url: Job posting URL
-    Returns:
-        dict: Extracted job data in markdown format
-    """
-    if not url or not url.strip():
-        raise HTTPException(status_code=400, detail="URL cannot be empty")
-    # Basic URL validation
-    if not url.startswith(('http://', 'https://')):
-        raise HTTPException(status_code=400, detail="Please provide a valid URL starting with http:// or https://")
     try:
         crawler = JobCrawler()
         job_data = crawler.crawl(url)
         markdown_output = crawler.to_markdown(job_data)
-        # Log the output
         print("\n" + "="*80)
         print(markdown_output)
         print("="*80)
         print("\n\nRAW MARKDOWN:\n")
         print(job_data['raw_markdown'])
-        return {
-            "success": True,
-            "job_data": job_data['raw_markdown'],
-            "formatted_output": markdown_output
-        }
     except Exception as e:
-        print(f"\n❌ Error during job extraction: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Failed to extract job data: {str(e)}")
 @app.post("/user_history")
@@ -165,77 +81,31 @@ async def user_history(
         Dictionary containing cleaned browser history
     """
-    # Validate file is provided
-    if not file.filename:
-        raise HTTPException(status_code=400, detail="No file provided")
-    # Validate file extension
-    if not file.filename.lower().endswith('.json'):
-        raise HTTPException(
-            status_code=400,
-            detail="Only JSON files are allowed. Please upload a .json file from Google Takeout"
-        )
-    # Validate max_entries
-    if max_entries is not None and max_entries < 1:
-        raise HTTPException(status_code=400, detail="max_entries must be at least 1")
-    if max_entries is not None and max_entries > 10000:
-        raise HTTPException(status_code=400, detail="max_entries cannot exceed 10000")
     try:
         # Read file content
         content = await file.read()
-        # Check if file is empty
-        if not content:
-            raise HTTPException(status_code=400, detail="Uploaded file is empty")
         # Parse JSON
-        try:
-            data = json.loads(content.decode('utf-8'))
-        except UnicodeDecodeError:
-            raise HTTPException(status_code=400, detail="File encoding is not valid UTF-8")
-        except json.JSONDecodeError as e:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Invalid JSON format: {str(e)}"
-            )
-        # Validate data structure
-        if not isinstance(data, dict):
-            raise HTTPException(
-                status_code=400,
-                detail="JSON file must contain an object/dictionary at root level"
-            )
         # Extract browser history
         browser_history = data.get("Browser History", [])
         if not browser_history:
-            raise HTTPException(
-                status_code=400,
-                detail="No 'Browser History' key found in JSON file. Please ensure you uploaded a valid Google Takeout history file"
-            )
-        if not isinstance(browser_history, list):
-            raise HTTPException(
-                status_code=400,
-                detail="'Browser History' must be a list/array"
-            )
         # Limit to max_entries or all available entries
-        entries_to_process = min(len(browser_history), max_entries if max_entries else len(browser_history))
         # Process and clean entries
         history = []
         for i in range(entries_to_process):
             entry = browser_history[i]
-            # Validate entry is a dictionary
-            if not isinstance(entry, dict):
-                continue
             cleaned_entry = {
                 "title": entry.get("title", ""),
                 "url": entry.get("url", ""),
@@ -250,233 +120,46 @@ async def user_history(
             "returned_entries": len(history),
             "history_data": history
         }
-    except HTTPException:
-        raise
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
 @app.post("/generate-resume")
 async def generate_resume(file: UploadFile = File(...)):
-    """
-    Generate a PDF resume from JSON data
-    Args:
-        file: JSON file containing resume data
-    Returns:
-        FileResponse: Generated PDF resume
-    """
-    # Validate file is provided
-    if not file.filename:
-        raise HTTPException(status_code=400, detail="No file provided")
-    # Validate file extension
-    if not file.filename.lower().endswith('.json'):
-        raise HTTPException(
-            status_code=400,
-            detail="Only JSON files are allowed. Please upload a .json file with resume data"
-        )
-    # Validate content type
-    content_type = file.content_type
-    if content_type and 'application/json' not in content_type.lower():
-        raise HTTPException(
-            status_code=400,
-            detail=f"Invalid content type: {content_type}. Expected application/json"
-        )
     try:
-        # Read and parse JSON
-        content = await file.read()
-        if not content:
-            raise HTTPException(status_code=400, detail="Uploaded file is empty")
-        try:
-            data = json.loads(content.decode('utf-8'))
-        except UnicodeDecodeError:
-            raise HTTPException(status_code=400, detail="File encoding is not valid UTF-8")
-        except json.JSONDecodeError as e:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Invalid JSON format: {str(e)}"
-            )
-        # Validate data structure
-        if not isinstance(data, dict):
-            raise HTTPException(
-                status_code=400,
-                detail="JSON must contain an object/dictionary at root level"
-            )
-        # Check for required fields
-        if 'name' not in data or not data['name']:
-            raise HTTPException(
-                status_code=400,
-                detail="JSON must contain a 'name' field with a non-empty value"
-            )
-        # Create temporary PDF file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
             output_path = tmp_pdf.name
-        # Generate PDF
         generate_pdf_resume(data, output_path)
-        # Validate PDF was created
-        if not os.path.exists(output_path):
-            raise HTTPException(status_code=500, detail="PDF file was not created")
-        if os.path.getsize(output_path) == 0:
-            os.remove(output_path)
-            raise HTTPException(status_code=500, detail="Generated PDF is empty")
-        # Return PDF file
-        return FileResponse(
-            output_path,
-            media_type="application/pdf",
-            filename="resume.pdf",
-            headers={
-                "Content-Disposition": "attachment; filename=resume.pdf"
-            }
-        )
-    except HTTPException:
-        raise
     except Exception as e:
-        print(f"❌ Error generating resume: {str(e)}")
-        # Clean up temporary file if it exists
-        if 'output_path' in locals() and os.path.exists(output_path):
-            try:
-                os.remove(output_path)
-            except:
-                pass
-        raise HTTPException(status_code=500, detail=f"Failed to generate resume: {str(e)}")
 @app.post("/upload-resume/", response_model=ResumeData)
 async def upload_resume(file: UploadFile = File(...)):
     """
     Upload a PDF resume, extract text, classify it via Gemini API, and return structured JSON.
-    Args:
-        file: PDF file containing resume
-    Returns:
-        ResumeData: Structured resume data
     """
-    # Validate file is provided
-    if not file.filename:
-        raise HTTPException(status_code=400, detail="No file provided")
-    # Validate file extension
-    if not file.filename.lower().endswith('.pdf'):
-        raise HTTPException(
-            status_code=400,
-            detail="Only PDF files are supported. Please upload a .pdf file"
-        )
-    # Validate content type
-    content_type = file.content_type
-    if content_type and 'application/pdf' not in content_type.lower():
-        raise HTTPException(
-            status_code=400,
-            detail=f"Invalid content type: {content_type}. Expected application/pdf"
-        )
-    tmp_path = None
-    try:
-        # Read file content
-        content = await file.read()
-        # Check if file is empty
-        if not content:
-            raise HTTPException(status_code=400, detail="Uploaded PDF file is empty")
-        # Validate PDF magic number (first 4 bytes should be %PDF)
-        if not content.startswith(b'%PDF'):
-            raise HTTPException(
-                status_code=400,
-                detail="File does not appear to be a valid PDF. File header is incorrect"
-            )
-        # Create temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
-            tmp.write(content)
-            tmp_path = tmp.name
-        # Extract text from PDF
-        try:
-            text = extract_text_from_pdf(tmp_path)
-        except Exception as e:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Failed to extract text from PDF: {str(e)}"
-            )
-        # Validate extracted text
-        if not text or not text.strip():
-            raise HTTPException(
-                status_code=400,
-                detail="No readable text found in the PDF. Please ensure the PDF contains selectable text (not just images)"
-            )
-        # Check minimum text length
-        if len(text.strip()) < 50:
-            raise HTTPException(
-                status_code=400,
-                detail="Insufficient text content in PDF. Please upload a complete resume"
-            )
-        # Classify resume using Gemini
-        try:
-            parsed_data = classify_resume_with_gemini(text)
-        except HTTPException:
-            raise
-        except Exception as e:
-            raise HTTPException(
-                status_code=500,
-                detail=f"Failed to parse resume content: {str(e)}"
-            )
-        # Validate and return structured data
-        try:
-            validated = ResumeData(**parsed_data)
-            return validated
-        except Exception as e:
-            raise HTTPException(
-                status_code=500,
-                detail=f"Failed to validate parsed resume data: {str(e)}"
-            )
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Unexpected error processing resume: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if tmp_path and os.path.exists(tmp_path):
-            try:
-                os.remove(tmp_path)
-            except Exception as e:
-                print(f"⚠️ Failed to remove temporary file {tmp_path}: {str(e)}")
-# Health check endpoint
-@app.get("/health")
-def health_check():
-    """Health check endpoint"""
-    return {
-        "status": "healthy",
-        "service": "LinkedIn Data API",
-        "version": "1.0.0"
-    }
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 from pydantic import BaseModel
 from typing import List, Optional
+from linkdin_job_data import JobCrawler, generate_pdf_resume, extract_text_from_pdf, classify_resume_with_gemini, ResumeData
 import os
 from fastapi import FastAPI, File, UploadFile, HTTPException, Form
 from fastapi.responses import FileResponse
 import requests
 import json
 import tempfile
+app = FastAPI()
 def get_linkdin_data(url: str, timeout: float = 30.0) -> dict:
     """
     Fetch LinkedIn data for ``url`` from the BrightData dataset scrape API.
     Args:
         url: LinkedIn URL placed in the request's ``input`` list.
         timeout: Seconds to wait for BrightData before giving up.
     Returns:
         dict: Decoded JSON body of the BrightData response.
     Raises:
         HTTPException: 400 if ``url`` is blank, 504 if the request times out,
             502 if the request fails, BrightData answers with an error status,
             or the response body is not valid JSON.
     """
     if not url or not url.strip():
         raise HTTPException(status_code=400, detail="URL cannot be empty")
     # NOTE(security): the fallback literal is a credential committed to source
     # control. Set BRIGHTDATA_API_TOKEN in the environment, rotate this key,
     # and then delete the fallback.
     token = os.environ.get(
         "BRIGHTDATA_API_TOKEN",
         "a6032564743ea10f33ac03ad330ab5299a0a06d15b606ff33bd91b40c6c2c098",
     )
     headers = {
         "Authorization": f"Bearer {token}",
         "Content-Type": "application/json",
     }
     data = json.dumps({
         "input": [{"url": url}],
     })
     try:
         # Without a timeout, requests waits indefinitely and ties up a worker.
         response = requests.post(
             "https://api.brightdata.com/datasets/v3/scrape?dataset_id=gd_l1viktl72bvl7bjuj0&notify=false&include_errors=true",
             headers=headers,
             data=data,
             timeout=timeout,
         )
         response.raise_for_status()
         return response.json()
     except requests.exceptions.Timeout as e:
         raise HTTPException(status_code=504, detail="Request to BrightData API timed out") from e
     except requests.exceptions.RequestException as e:
         raise HTTPException(status_code=502, detail=f"Failed to fetch LinkedIn data: {str(e)}") from e
     except ValueError as e:
         # Older requests versions raise a plain ValueError for a non-JSON body.
         raise HTTPException(status_code=502, detail="BrightData API returned a non-JSON response") from e
 @app.get("/")
 def read_root():
     """Return the static welcome payload served at the API root."""
     welcome = {"message": "Welcome to the LinkedIn Data API"}
     return welcome
 @app.get("/user_data")
 def get_user_data(url: str):
     """
     Look up LinkedIn data for ``url`` and wrap it in a ``data`` envelope.
     Args:
         url: URL forwarded unchanged to ``get_linkdin_data``.
     Returns:
         dict: ``{"data": <result of get_linkdin_data(url)>}``.
     """
     return {"data": get_linkdin_data(url)}
 @app.get("/job_extraction")
 def job_extraction(url: str):
     """
     Crawl a job posting and return its raw markdown.
     Args:
         url: Job posting URL handed to ``JobCrawler.crawl``.
     Returns:
         dict: ``{"job_data": <raw markdown>}`` on success, or
             ``{"error": <message>}`` if crawling or formatting raised.
     """
     try:
         crawler = JobCrawler()
         job_data = crawler.crawl(url)
         markdown_output = crawler.to_markdown(job_data)
         # Echo the formatted markdown to stdout for the server log
         print("\n" + "="*80)
         print(markdown_output)
         print("="*80)
         # Echo the raw markdown as well before returning it
         raw_markdown = job_data['raw_markdown']
         print("\n\nRAW MARKDOWN:\n")
         print(raw_markdown)
         return {"job_data": raw_markdown}
     except Exception as e:
         # Single boundary handler: a second identical `except Exception`
         # clause could never run, so only this one is kept. The error is
         # reported in the response body, as callers already expect.
         print(f"Error: {str(e)}")
         return {"error": str(e)}
 @app.post("/user_history")
         Dictionary containing cleaned browser history
     """
+    # Validate file type
+    if not file.filename.endswith('.json'):
+        raise HTTPException(status_code=400, detail="Only JSON files are allowed")
     try:
         # Read file content
         content = await file.read()
         # Parse JSON
+        data = json.loads(content.decode('utf-8'))
         # Extract browser history
         browser_history = data.get("Browser History", [])
         if not browser_history:
+            raise HTTPException(status_code=400, detail="No 'Browser History' found in JSON file")
         # Limit to max_entries or all available entries
+        entries_to_process = min(len(browser_history), max_entries)
         # Process and clean entries
         history = []
         for i in range(entries_to_process):
             entry = browser_history[i]
             cleaned_entry = {
                 "title": entry.get("title", ""),
                 "url": entry.get("url", ""),
             "returned_entries": len(history),
             "history_data": history
         }
+    except json.JSONDecodeError:
+        raise HTTPException(status_code=400, detail="Invalid JSON format")
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
 @app.post("/generate-resume")
 async def generate_resume(file: UploadFile = File(...)):
     """
     Generate a PDF resume from an uploaded JSON file.
     Args:
         file: Uploaded file whose content is parsed as JSON and passed to
             ``generate_pdf_resume``.
     Returns:
         FileResponse: The generated PDF, served as ``resume.pdf``.
     Raises:
         HTTPException: 400 if the upload is not valid JSON, 500 if PDF
             generation fails or produces a missing/empty file.
     """
     try:
         data = json.load(file.file)
     except ValueError as e:
         # JSONDecodeError and UnicodeDecodeError are both ValueError
         # subclasses; a malformed upload is a client error, not a 500.
         raise HTTPException(status_code=400, detail="Invalid JSON format") from e
     output_path = None
     try:
         # delete=False: the path must outlive this context so the generator
         # can write to it and FileResponse can stream it afterwards.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
             output_path = tmp_pdf.name
         generate_pdf_resume(data, output_path)
         if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
             raise HTTPException(status_code=500, detail="PDF generation failed")
         return FileResponse(output_path, media_type="application/pdf", filename="resume.pdf")
     except Exception as e:
         # Any failure means the temp file will never be served: remove it.
         if output_path and os.path.exists(output_path):
             try:
                 os.remove(output_path)
             except OSError:
                 pass  # best-effort cleanup; the original error matters more
         if isinstance(e, HTTPException):
             # Already a well-formed HTTP error; do not re-wrap it.
             raise
         print("❌ Error:", e)
         raise HTTPException(status_code=500, detail=str(e)) from e
 @app.post("/upload-resume/", response_model=ResumeData)
 async def upload_resume(file: UploadFile = File(...)):
     """
     Upload a PDF resume, extract text, classify it via Gemini API, and return structured JSON.
     Args:
         file: Uploaded file; its name must end in ``.pdf`` (case-insensitive).
     Returns:
         ResumeData: Model built from the output of ``classify_resume_with_gemini``.
     Raises:
         HTTPException: 400 if the file is not named ``*.pdf`` or no readable
             text is extracted from it.
     """
     # filename can be None on some uploads; treat that as "not a PDF".
     if not (file.filename or "").lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="Only PDF files are supported.")
     tmp_path = None
     try:
         # delete=False so the extractor can reopen the file by path; the
         # finally block below is responsible for removing it.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
             tmp_path = tmp.name
             tmp.write(await file.read())
         text = extract_text_from_pdf(tmp_path)
         # Whitespace-only output carries no content to classify.
         # NOTE(review): assumes extract_text_from_pdf returns a str - confirm.
         if not text or not text.strip():
             raise HTTPException(status_code=400, detail="No readable text found in the PDF.")
         parsed_data = classify_resume_with_gemini(text)
         validated = ResumeData(**parsed_data)
         return validated
     finally:
         if tmp_path is not None:
             try:
                 os.remove(tmp_path)
             except OSError as e:
                 # A cleanup failure must not replace the real response/error.
                 print(f"⚠️ Failed to remove temporary file {tmp_path}: {str(e)}")