Spaces:
Sleeping
Sleeping
| from flask import Flask, request, jsonify | |
| import os | |
| import requests | |
| from PIL import Image | |
| import tempfile | |
| from PyPDF2 import PdfReader | |
| from threading import Thread | |
| import io | |
| import fitz | |
| from groq import Groq | |
| from queue import Queue | |
| import base64 | |
# --- Application setup ------------------------------------------------------
app = Flask(__name__)

# Credentials are injected via environment variables (never hard-coded).
HF_TOKEN = os.environ.get('HF_TOKEN')
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

# One shared Groq client, reused by every prediction helper below.
client = Groq(api_key=GROQ_API_KEY)

# Opt-in low-memory mode (set LOW_MEMORY=1), preserved from the original design.
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Low memory mode: {LOW_MEMORY}")
class TextStreamer:
    """Minimal thread-safe text stream.

    A producer thread put()s text chunks; a consumer iterates over the
    instance, receiving chunks in order until a ``None`` sentinel is put,
    which ends the iteration.
    """

    def __init__(self):
        # Unbounded FIFO shared between producer and consumer threads.
        self.queue = Queue()
        # Unused internally; retained so any external reader still finds it.
        self.buffer = ""

    def put(self, text):
        """Enqueue a text chunk; ``put(None)`` signals end-of-stream."""
        self.queue.put(text)

    def __iter__(self):
        """Yield chunks in arrival order until the ``None`` sentinel.

        Uses a blocking ``Queue.get()`` — the original polled ``empty()``
        in a tight loop, busy-waiting and burning a full CPU core while
        the producer was still generating.
        """
        while True:
            text = self.queue.get()  # blocks until a chunk is available
            if text is None:  # End signal
                break
            yield text
def extract_image_from_pdf(pdf_url, dpi=75):
    """
    Download a PDF and render its first page as an image, fully in memory.

    Args:
        pdf_url (str): URL of the PDF to download.
        dpi (int): Render resolution in dots per inch (default 75).

    Returns:
        PIL.Image.Image: the first page rendered as an RGB image,
        or None if downloading or rendering fails.
    """
    try:
        print(f"Attempting to download PDF from: {pdf_url}")
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        print(f"PDF download status code: {response.status_code}")

        print("Opening PDF document...")
        # Context manager guarantees the document is closed even when
        # rendering raises — the original leaked it on every error path.
        with fitz.open(stream=response.content, filetype="pdf") as pdf_document:
            print("Getting first page...")
            first_page = pdf_document[0]

            print("Rendering page to pixmap...")
            # Scale the default 72-dpi page matrix up/down to the requested dpi.
            pix = first_page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))

            print("Converting to PIL Image...")
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        print("Successfully extracted image from PDF")
        return img
    except Exception as e:
        # Best-effort contract: callers treat None as "no image available".
        print(f"Error extracting first page: {e}")
        return None
def predict_image(image_url, text, file_pref):
    """
    Send an image (direct URL or first page of a PDF) plus a text prompt
    to the Groq vision model and return the fully assembled streamed reply.

    Args:
        image_url (str): URL of the image, or of a PDF when file_pref != 'img'.
        text (str): Prompt sent alongside the image.
        file_pref (str): 'img' for a direct image URL; any other value is
            treated as a PDF whose first page is rendered.

    Returns:
        str: concatenation of all streamed model output chunks.

    Raises:
        ValueError: wrapping any download, rendering, or model error.
    """
    try:
        if file_pref == 'img':
            response = requests.get(image_url)
            response.raise_for_status()
            image = Image.open(io.BytesIO(response.content)).convert("RGB")
        else:
            print("Extracting image from PDF...")
            image = extract_image_from_pdf(image_url)
            # extract_image_from_pdf returns None on failure; fail fast with
            # a clear message instead of an AttributeError on image.save().
            if image is None:
                raise ValueError(f"Could not extract an image from PDF: {image_url}")

        # Re-encode as a base64 data URL so the model can consume it inline.
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        image_url = f"data:image/png;base64,{img_str}"
        print(f"Image URL being sent to Groq: {image_url[:100]}...")

        streamer = TextStreamer()

        def generate_response():
            # Producer: runs on a worker thread, pushing chunks into the
            # streamer. The finally-block guarantees the None end sentinel
            # is sent on every path, so the consumer can never hang.
            try:
                completion = client.chat.completions.create(
                    model="llama-3.2-11b-vision-preview",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {"url": image_url}
                                },
                                {
                                    "type": "text",
                                    "text": text
                                }
                            ]
                        }
                    ],
                    temperature=0.7,
                    max_tokens=4096,
                    top_p=1,
                    stream=True
                )
                for chunk in completion:
                    if chunk.choices[0].delta.content:
                        streamer.put(chunk.choices[0].delta.content)
            except Exception as e:
                print(f"Error in generate_response: {e}")
            finally:
                streamer.put(None)  # Signal the end, even on error

        thread = Thread(target=generate_response)
        thread.start()

        # Consumer: drain the stream into a single string.
        return "".join(streamer)
    except Exception as e:
        raise ValueError(f"Error during prediction: {str(e)}")
def extract_text_from_pdf(pdf_url):
    """
    Download a PDF and return the concatenated text of all its pages.

    Args:
        pdf_url (str): URL of the PDF to download.

    Returns:
        str: extracted text (may be empty for image-only PDFs).

    Raises:
        ValueError: wrapping any download or parsing error.
    """
    try:
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        # Parse directly from memory: the original wrote a NamedTemporaryFile
        # and leaked it whenever parsing raised before os.remove() ran.
        reader = PdfReader(io.BytesIO(response.content))
        # extract_text() returns None for pages without a text layer;
        # "or ''" prevents a TypeError when concatenating.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        raise ValueError(f"Error extracting text from PDF: {str(e)}")
def predict_text(text):
    """Send a plain-text prompt to the Groq chat model and return the
    complete response assembled from the streamed chunks."""
    streamer = TextStreamer()

    def generate_response():
        # Producer: runs on a worker thread, feeding chunks to the streamer
        # and terminating the stream with the None sentinel.
        try:
            completion = client.chat.completions.create(
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                messages=[{"role": "user", "content": text}],
                temperature=0.7,
                max_tokens=4096,
                top_p=1,
                stream=True
            )
            for chunk in completion:
                delta = chunk.choices[0].delta.content
                if delta:
                    streamer.put(delta)
            streamer.put(None)  # Signal the end
        except Exception as e:
            print(f"Error in generate_response: {e}")
            streamer.put(None)

    Thread(target=generate_response).start()

    # Consumer: collect every streamed piece, then join once.
    pieces = []
    for piece in streamer:
        pieces.append(piece)
    return "".join(pieces)
# --- Prompt templates sent to the text/vision models below ------------------

# Course-outline extraction: applied to text pulled from a syllabus PDF.
PROMPT = (
    "Extract the following information as per this format:\n"
    "'Course Code:'\n"
    "'Course Name:'\n"
    "'Course Description:'\n"
    "'Course Credits:'\n"
    "'Course Learning Outcomes:'\n"
    "'Delivery Method:'\n"
    "'Prerequisite(s):'\n"
    "'Co-requisite(s):'\n"
    "'Materials:'\n"
    "'Topical Outline:'\n"
    "Do not add anything else except the required information from this text."
)

# Follow-up prompt: derives a tiered skills breakdown from a prior response.
PROMPT_SKILLS = (
    "Provide skills based on the Lightcast Open Skills Taxonomy in categories as:\n"
    "'Primary Skills' (the degree program or certification),\n"
    "'Secondary Skills', and\n"
    "'Tertiary Skills'."
)

# Vision prompt: extract the "student" header section of an academic record
# image as strict JSON (missing fields become null).
PROMPT_IMAGE_STUDENT = (
    "You are a highly intelligent assistant designed to analyze images and extract structured information from them. "
    "Your task is to analyze the given image of a student's academic record and generate a response in the exact JSON format provided below. "
    "If any specific information is missing or unavailable in the image, replace the corresponding field with null. "
    "Ensure the format is consistent, strictly adhering to the structure shown below.\n\n"
    "Required JSON Format:\n\n"
    "{\n"
    '    "student": {\n'
    '        "name": "string",\n'
    '        "id": "string",\n'
    '        "dob": "string",\n'
    '        "original_start_date": "string",\n'
    '        "cumulative_gpa": "string",\n'
    '        "program": "string",\n'
    '        "status": "string"\n'
    '    }\n'
    "}\n\n"
    "Instructions:\n\n"
    "1. Extract the student's general information as displayed in the image.\n"
    "2. Use null for any missing or unavailable information.\n"
    "3. Format the extracted data exactly as shown above. Do not deviate from this structure.\n"
    "4. Ensure accurate field names and proper nesting.\n"
    "5. Return only the 'student' section as JSON.\n"
)

# Vision prompt: extract the "courses" table of an academic record image
# as strict JSON (missing fields become null).
PROMPT_IMAGE_COURSES = (
    "You are a highly intelligent assistant designed to analyze images and extract structured information from them. "
    "Your task is to analyze the given image of a student's academic record and generate a response in the exact JSON format provided below. "
    "If any specific information is missing or unavailable in the image, replace the corresponding field with null. "
    "Ensure the format is consistent, strictly adhering to the structure shown below.\n\n"
    "Required JSON Format:\n\n"
    "{\n"
    '    "courses": [\n'
    '        {\n'
    '            "transfer_institution": "string",\n'
    '            "course_code": "string",\n'
    '            "course_name": "string",\n'
    '            "credits_attempted": number,\n'
    '            "credits_earned": number,\n'
    '            "grade": "string",\n'
    '            "quality_points": number,\n'
    '            "semester_code": "string",\n'
    '            "semester_dates": "string"\n'
    '        }\n'
    "        // Additional courses can be added here\n"
    "    ]\n"
    "}\n\n"
    "Instructions:\n\n"
    "1. Extract the course details as displayed in the image.\n"
    "2. Use null for any missing or unavailable information.\n"
    "3. Format the extracted data exactly as shown above. Do not deviate from this structure.\n"
    "4. Ensure accurate field names and proper nesting.\n"
    "5. Return only the 'courses' section as JSON.\n"
)
# NOTE(review): no view in this file was registered with the app — without
# @app.route Flask would 404 every request. Restore the root route here.
@app.route("/")
def home():
    """Root endpoint: returns a short usage hint for the API."""
    return jsonify({"message": "Welcome to the PDF Extraction API. Use the /extract endpoint to extract information."})
# Registered so browsers' automatic /favicon.ico requests get a quiet 204
# instead of a logged 404 (without @app.route this view was never served).
@app.route("/favicon.ico")
def favicon():
    """Return an empty 204 No Content for favicon requests."""
    return "", 204
# Route restored — without @app.route this endpoint was unreachable.
@app.route("/extract", methods=["POST"])
def extract_info():
    """
    POST /extract — extract structured info from a PDF and/or image.

    Expected JSON body (all keys optional):
        url (str|None): PDF whose text is summarized via PROMPT.
        skills (bool): if truthy and a PDF summary was produced, also
            derive a skills breakdown via PROMPT_SKILLS.
        img_url (str|None): image or PDF URL for the vision extraction.
        file_pref (str): 'img' for a direct image, otherwise treated as
            a PDF (used only when img_url is set).

    Returns:
        JSON {"extracted_info": str} on success,
        {"error": str} with 400/500 on failure.
    """
    data = request.json
    if not data:
        return jsonify({"error": "Please provide a PDF URL in the request body."}), 400
    try:
        # .get() tolerates missing keys; the original indexed data["..."]
        # directly and turned any absent key into a KeyError -> HTTP 500.
        pdf_url = data.get("url")
        if pdf_url is not None:
            pdf_text = extract_text_from_pdf(pdf_url)
            response = predict_text(f"{PROMPT}\n\n{pdf_text}")
        else:
            response = ''

        # Skills pass only makes sense when there is a summary to feed it.
        if data.get("skills") and response:
            prompt_skills = f"{PROMPT_SKILLS} using this information only -- {response}"
            response_skills = predict_text(prompt_skills)
        else:
            response_skills = ''

        img_url = data.get("img_url")
        if img_url is not None:
            file_pref = data.get("file_pref")
            # Two separate vision calls: student header, then course table.
            response_student = predict_image(img_url, f"{PROMPT_IMAGE_STUDENT}\n", file_pref)
            response_courses = predict_image(img_url, f"{PROMPT_IMAGE_COURSES}\n", file_pref)
            response_image = response_student + response_courses
        else:
            response_image = ''

        return jsonify({"extracted_info": response + "\n" + response_skills + "\n" + response_image})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
# Development entry point: bind on all interfaces; port 7860 is the
# Hugging Face Spaces convention.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)