# Source: Hugging Face Space "sam0303/web_scraper" (commit e5b9101).
# Space status when captured: Runtime error. File size: 6,494 Bytes.

import os
import time
from selenium import webdriver
import base64
import requests
import json
import csv
import gradio as gr
from openai import OpenAI
import uuid 


def capture_full_page_screenshots(url, output_folder, scroll_size=400):
    """Scroll through a page in Chrome and save one screenshot per scroll step.

    The page height is read once after loading; the window is then scrolled
    down ``scroll_size`` pixels at a time, saving a PNG with a random UUID in
    its name before each scroll, until the scroll position reaches that height.

    Args:
        url: Address of the page to open.
        output_folder: Directory that receives the PNG files; created if it
            does not exist.
        scroll_size: Number of pixels to scroll between screenshots. Must be
            positive.

    Returns:
        A status string naming the output folder.

    Raises:
        ValueError: If ``scroll_size`` is not positive (the scroll loop could
            never reach the bottom of the page).
    """
    if scroll_size <= 0:
        raise ValueError("scroll_size must be a positive number of pixels")

    # Create the folder before launching the browser so that a bad path fails
    # without leaving a Chrome process behind.
    os.makedirs(output_folder, exist_ok=True)

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.maximize_window()

        total_height = driver.execute_script("return document.body.scrollHeight")
        scroll_position = 0

        while scroll_position < total_height:
            # A random UUID keeps file names unique across pages and runs.
            random_string = str(uuid.uuid4())
            screenshot_path = os.path.join(output_folder, f"screenshot_{random_string}.png")
            driver.save_screenshot(screenshot_path)
            print(f"Saved {screenshot_path}")

            scroll_position += scroll_size
            driver.execute_script(f"window.scrollTo(0, {scroll_position});")
            # Give the page a moment to render after scrolling.
            time.sleep(1)
    finally:
        # Always release the browser, even if loading or a screenshot failed.
        driver.quit()

    return f"Screenshots saved to {output_folder}"



def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode and the base64 bytes are decoded to a
    ``str`` using UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def vision(api_key, folder_path):
    """Extract product names and prices from screenshots into a CSV file.

    Every ``.png`` file in ``folder_path`` is base64-encoded and posted to the
    OpenAI chat-completions endpoint together with a prompt asking for a JSON
    list of products. Each returned entry that has both a ``Product Name`` and
    a ``Product Price`` key becomes one row of ``product_details.csv``, which
    is (re)created in the current working directory.

    Images whose request fails, whose reply has no message, or whose reply is
    not valid JSON are reported on stdout and skipped.

    Args:
        api_key: OpenAI API key sent as a bearer token.
        folder_path: Directory containing the ``.png`` screenshots.

    Returns:
        The fixed status string ``"Successfully Updated the File"``.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        requests.RequestException: If a request cannot be completed, including
            when it exceeds the timeout.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    # Upper bound, in seconds, for one API call so a stalled connection cannot
    # hang the whole run.
    request_timeout = 120

    csv_file_path = 'product_details.csv'
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(["Product Name", "Product Price"])

        for filename in os.listdir(folder_path):
            if not filename.endswith(".png"):
                continue

            image_path = os.path.join(folder_path, filename)
            base64_image = encode_image(image_path)

            # The prompt text (including its embedded whitespace) is kept
            # verbatim; only the data-URL media type was corrected to match
            # the PNG files being sent.
            payload = {
                "model": "gpt-4-turbo",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": """The image might contain name of some products and their respective pricing.
                                    Identify them. Ignore the partially visible names. Return me the details in json format. 
                                    The json output should have two variables: 1. Product Name   2. Product Price
                                    You should only pass the json output and say nothing else. Just the json output in needed
                                    """
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 300
            }

            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=request_timeout,
            )

            if response.status_code != 200:
                print(f"Request for {filename} failed with status {response.status_code}")
                continue

            data = response.json()
            choices = data.get('choices') if isinstance(data, dict) else None
            if not choices or 'message' not in choices[0]:
                # Without this guard the previous image's answer would be
                # parsed again (or ``content`` would be undefined).
                print(f"No message returned for {filename}")
                continue

            content = choices[0]['message'].get('content')
            if not content:
                print(f"Empty message returned for {filename}")
                continue

            # Strip the Markdown code fence the model often wraps JSON in.
            clean_content = content.strip().replace('```json', '').replace('```', '').replace('\n', '')
            try:
                products = json.loads(clean_content)
            except json.JSONDecodeError as e:
                print("Failed to parse JSON:", e)
                print("Cleaned JSON content that failed:", repr(clean_content))
                continue

            # A single product may come back as one object instead of a list.
            if isinstance(products, dict):
                products = [products]
            elif not isinstance(products, list):
                print(f"Unexpected JSON structure for {filename}")
                continue

            for product in products:
                if isinstance(product, dict) and 'Product Name' in product and 'Product Price' in product:
                    csv_writer.writerow([product['Product Name'], product['Product Price']])

    return "Successfully Updated the File"

def update_url(url_input, output_folder, api_key_input, max_pages=50):
    """Follow a paginated listing by asking a chat model for each next URL.

    Starting from ``url_input``, the model is asked to rewrite the current URL
    so that it points at the following page. Each proposed URL is fetched with
    ``requests``; if it answers with status 200 the page is screenshotted via
    ``capture_full_page_screenshots`` and becomes the new current URL.

    The loop stops when the proposed URL is empty or was already seen, when a
    URL does not answer with status 200, when any exception occurs, or after
    ``max_pages`` pages have been captured.

    Args:
        url_input: URL of the first page (not captured by this function).
        output_folder: Directory that receives the screenshots.
        api_key_input: OpenAI API key.
        max_pages: Upper bound on the number of follow-up pages to capture.

    Returns:
        A status string naming the output folder.
    """
    client = OpenAI(api_key=api_key_input)
    current_url = url_input
    # URLs already handled; a repeated URL means the model made no progress.
    visited_urls = {url_input}
    pages_captured = 0

    while pages_captured < max_pages:
        try:
            # NOTE(review): "gpt-3.5-turbo-0301" is a dated model snapshot -
            # confirm it is still served. An API error here ends pagination
            # through the except clause below.
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo-0301",
                messages=[
                    {"role": "system", "content": "You are a URL modifier. Given an url, you will modify it accordingly. You will not access the website"},
                    {"role": "user", "content": f'You need to modify the given url {current_url} in a way where I can access the following page. Try to identify at which part of the url, the pagination is defined and modify that part. Just provide the modified URL. You are not supposed to talk anything else with the user.'}
                ]
            )

            # The reply may carry surrounding whitespace or be empty.
            updated_url = (completion.choices[0].message.content or "").strip()

            if not updated_url or updated_url in visited_urls:
                print(f"No new URL produced after {current_url}. Stopping the loop.")
                break

            # Check that the proposed page actually exists before capturing it.
            response = requests.get(updated_url, timeout=30)
            if response.status_code != 200:
                print(f"Failed to access {updated_url}. Stopping the loop.")
                break

            capture_full_page_screenshots(updated_url, output_folder)
            visited_urls.add(updated_url)
            current_url = updated_url
            pages_captured += 1

        except Exception as e:
            # Best-effort pagination: any failure simply ends the walk.
            print(f"An error occurred: {e}. Stopping the loop.")
            break

    return f"Processing Completed. Screenshots saved in {output_folder}"

def process(url, output_folder, api_key, web_type_input):
    """Run the scraping pipeline for one URL.

    The starting page is always screenshotted and the screenshots are always
    passed to ``vision``. For any mode other than ``"Dynamic"`` the follow-up
    pages are captured through ``update_url`` in between.

    Returns the fixed status string ``"Processing Completed"``.
    """
    follow_pagination = web_type_input != "Dynamic"

    capture_full_page_screenshots(url, output_folder)
    if follow_pagination:
        update_url(url, output_folder, api_key)
    vision(api_key, output_folder)

    return "Processing Completed"

# Gradio UI
# Input widgets, listed in the same order as the parameters of ``process``.
url_input = gr.Textbox(label="URL")
output_folder_input = gr.Textbox(label="Output Folder Path")
api_key_input = gr.Textbox(label="API Key", type="password")
# The mode dropdown is bound to two module-level names.
mode_input = gr.Dropdown(label="Mode", choices=["Dynamic", "Paginated"])
web_type_input = mode_input


# Build the interface around ``process`` and start serving it immediately.
gr.Interface(
    fn=process,
    inputs=[
        url_input,
        output_folder_input,
        api_key_input,
        web_type_input,
    ],
    outputs="text",
    title="Full Page Screenshot and OCR",
).launch()