File size: 4,831 Bytes
a952212
 
 
 
 
 
 
 
 
 
f9ae632
a952212
 
 
f9ae632
a952212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9ae632
 
a952212
f9ae632
a952212
 
 
 
f9ae632
 
 
a952212
 
 
 
 
 
f9ae632
 
 
 
 
a952212
 
 
f9ae632
a952212
 
f9ae632
 
a952212
f9ae632
a952212
f9ae632
a952212
 
 
f9ae632
 
a952212
 
 
f9ae632
a952212
 
 
bfbf372
a952212
f9ae632
 
 
 
 
 
 
a952212
 
f9ae632
 
 
bfbf372
 
 
 
 
2925c24
a952212
 
bfbf372
 
 
 
 
 
 
 
 
 
 
a952212
bfbf372
 
 
 
 
a952212
 
 
 
f9ae632
a952212
 
 
 
f9ae632
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import base64
import os
import io
import json
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from PyPDF2 import PdfReader
from PIL import Image
import fitz  # PyMuPDF
import openai
from dotenv import load_dotenv

# Load environment variables from a local .env file (if present) so the
# OpenAI key can be supplied without exporting it in the shell.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Fail fast at import time: every endpoint depends on the OpenAI API,
# so starting without a key would only defer the error to request time.
if not openai.api_key:
    raise RuntimeError("Missing OpenAI API key. Please set OPENAI_API_KEY in the environment variables.")

app = FastAPI()

# Fully open CORS policy: any origin, any method, any header.
# NOTE(review): "*" origins with allow_credentials=True is permissive;
# confirm this is intended before deploying publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

def vision(file_content):
    """Extract text from a PDF by rendering each page to a PNG image and
    asking GPT-4o to transcribe the images (vision-based OCR fallback).

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The text content returned by the model as a single string.

    Raises:
        HTTPException: 500 if the GPT-4o request fails for any reason.
    """
    vision_data = [{"type": "text", "text": "Extract all text from these images."}]

    # Render every page and attach it as a base64 data URL.
    # pix.tobytes("png") already yields encoded PNG bytes, so no
    # PIL round-trip is needed. The context manager guarantees the
    # document handle is closed even if rendering fails mid-way.
    with fitz.open(stream=file_content, filetype="pdf") as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            img_base64 = base64.b64encode(pix.tobytes("png")).decode("utf-8")
            vision_data.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_base64}"},
            })

    print("PDF pages converted to images successfully!")

    # Send all page images to GPT-4o in a single request.
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": vision_data}],
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error in GPT-4o vision processing: {str(e)}")

def _strip_code_fences(text: str) -> str:
    """Remove a surrounding ```json ... ``` Markdown fence from *text*, if present.

    Only the fence markers at the edges are removed, so occurrences of the
    word "json" inside the payload are left intact (a blanket
    str.replace("json", "") would corrupt field values containing it).
    """
    text = text.strip()
    if text.startswith("```"):
        text = text[3:].lstrip()
        if text.startswith("json"):
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


@app.post("/get_ocr_data/")
def get_data(input_file: UploadFile = File(...)):
    """Extract structured data from an uploaded PDF résumé.

    Reads the PDF's text layer with PyPDF2; if the PDF has no extractable
    text (e.g. a scanned document), falls back to GPT-4o vision OCR. The
    extracted text is then sent to GPT-4o to be normalized into a JSON
    array of candidate fields.

    Returns:
        {"data": <parsed JSON structure produced by the model>}

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 for processing errors.
    """
    try:
        # Read the uploaded file into memory.
        file_content = input_file.file.read()
        file_type = input_file.content_type
        extracted_text = ""

        if file_type == "application/pdf":
            pdf_reader = PdfReader(io.BytesIO(file_content))

            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    extracted_text += text + "\n"

            # Scanned PDFs have no text layer; fall back to vision OCR.
            if not extracted_text.strip():
                print("\nVision OCR running...\n")
                extracted_text = vision(file_content)
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")

        print("Extracted Text:\n", extracted_text.strip())

        # Ask GPT-4o to turn the raw text into structured JSON.
        # The example output must itself be valid JSON (an array of
        # objects), otherwise the model is being shown the exact
        # malformed shape we tell it to avoid.
        prompt = f"""
        This is CV data: {extracted_text.strip()}.
        IMPORTANT: The output should be a JSON array! Make sure the JSON is valid. If no data is found, fill missing fields with "none". Do not add any extra explanation text.
        Need only JSON output.
        
        Example Output:
        ```json
        [
           {{
              "firstname": "firstname",
              "lastname": "lastname",
              "email": "email",
              "contact_number": "contact number",
              "home_address": "full home address",
              "home_town": "home town or city",
              "total_years_of_experience": "total years of experience",
              "education": "Institution Name, Country, Degree Name, Graduation Year; Institution Name, Country, Degree Name, Graduation Year",
              "LinkedIn_link": "LinkedIn link",
              "experience": "experience",
              "industry": "industry of work",
              "skills": "skills (Identify and list specific skills mentioned in both the skills section and inferred from the experience section), formatted as: Skill 1, Skill 2, Skill 3, Skill 4, Skill 5",
              "positions": ["Job title 1, Job title 2, Job title 3"]
           }}
        ]
        ```
        """

        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an assistant that processes CV data into structured JSON."},
                {"role": "user", "content": prompt}
            ]
        )

        # Strip any Markdown fence the model wrapped around the JSON,
        # then parse to guarantee the client receives valid structure.
        json_response = _strip_code_fences(response["choices"][0]["message"]["content"])
        structured_data = json.loads(json_response)

        return {"data": structured_data}

    except HTTPException:
        # Re-raise intentional HTTP errors (e.g. the 400 above) unchanged
        # instead of letting the generic handler rewrap them as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")