Spaces:

PyQuarX
/

scrape-with-ai

Paused

File size: 4,815 Bytes

7d633ab
3e80e9e
 
7d633ab
3e80e9e
7d633ab
 
 
3e80e9e
c6aa35b
3e80e9e
 
c6aa35b
45efca9
3e80e9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d633ab
 
 
3e80e9e
 
7d633ab
 
 
 
 
3e80e9e
7d633ab
3e80e9e
7d633ab
3e80e9e
7d633ab
 
 
90924f6
 
3e80e9e
 
 
90924f6
 
 
7d633ab
90924f6
 
 
7d633ab
 
 
 
 
 
 
 
 
 
 
 
90924f6
 
3e80e9e
90924f6
3e80e9e

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
import os
import pandas as pd



# Load the OpenRouter API key from the environment (variable ``API_MV``) so
# the secret is never committed with the source.
# NOTE(review): a key was previously hard-coded on this line and published
# with the file; treat it as compromised and rotate it.
# NOTE(review): if API_MV is unset, None is passed to ChatOpenAI and its own
# key resolution/validation applies -- confirm the deployment sets API_MV.
openrouter_api_key = os.getenv("API_MV")

# Shared chat model used by every function in this module. OpenRouter is
# reached through ChatOpenAI by overriding the base URL.
model = ChatOpenAI(
    openai_api_key=openrouter_api_key,  # OpenRouter API key (from the environment)
    model="google/gemini-2.0-flash-exp:free",  # Model id as named on OpenRouter
    base_url="https://openrouter.ai/api/v1",  # OpenRouter API URL
)

# Prompt template for per-chunk extraction. It has two placeholders:
# {dom_content} (the text to extract from) and {parse_description} (what to
# extract). parse() fills both through ChatPromptTemplate.from_template.
# The instructions ask the model to answer ONLY with Markdown tables, or with
# an empty string when nothing matches the description.
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Task:** Extract data from the provided text that matches the description: {parse_description}.\n"
    "2. **Output Format:** Return the extracted data ONLY as one or more Markdown tables. Each table MUST be correctly formatted.\n"
    "3. **Markdown Table Format:** Each table must adhere to the following Markdown format:\n"
    "   - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
    "   - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
    "   - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
    "   - Use pipes (|) to separate columns in each data row.\n"
    "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table(s).\n"
    "5. **Empty Response:** If no information matches the description, return an empty string ('').\n"
    "6. **Multiple Tables:** If the text contains multiple tables matching the description, return each table separately, following the Markdown format for each.\n"
    "7. **Accuracy:** The extracted data must be accurate and reflect the information in the provided text.\n"
)

def parse(dom_chunks, parse_description):
    """Run the extraction prompt over each chunk and join the model replies.

    Args:
        dom_chunks: Sized sequence of text chunks. Each chunk is sent to the
            model as the ``dom_content`` template variable.
        parse_description: Description of the data to extract; sent as the
            ``parse_description`` template variable.

    Returns:
        The ``content`` of every model response, in chunk order, joined with
        newlines.
    """
    # Prompt and model are piped together once and reused for every chunk.
    extraction_chain = ChatPromptTemplate.from_template(template) | model

    collected = []
    for batch_number, dom_content in enumerate(dom_chunks, start=1):
        payload = {
            "dom_content": dom_content,
            "parse_description": parse_description,
        }
        reply = extraction_chain.invoke(payload)

        # Progress is reported only after the request for this chunk returned.
        print(f"Parsed batch {batch_number} of {len(dom_chunks)}")
        collected.append(reply.content)

    return "\n".join(collected)

def merge_tables_with_llm(tables, parse_description):
    """Merge a list of pandas DataFrames into one Markdown table using the LLM.

    Each DataFrame is rendered with ``to_markdown(index=False)``, the rendered
    tables are appended to a fixed set of merge instructions, and the prompt
    is sent to the module-level ``model`` as a single human message.

    Args:
        tables: Iterable of pandas DataFrames to merge.
        parse_description: Description of what the tables contain; it is
            interpolated into the merge instructions.

    Returns:
        The ``content`` of the model response, or ``""`` when ``tables`` is
        empty (no request is made in that case).
    """
    # Convert DataFrames to Markdown strings
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Nothing to merge: skip the API call and follow the prompt's own
    # "empty string" convention.
    if not table_strings:
        return ""

    # Only the two instruction lines that mention the description are
    # f-strings, so parse_description is actually substituted. The table text
    # is concatenated afterwards and never passed through a formatter, which
    # keeps braces inside table cells from being misread as placeholders.
    instructions = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        f"The tables contain information related to: {parse_description}.\n"
        "Please follow these instructions carefully:\n\n"
        f"1. **Task:** Merge the data from the following tables into a single table that matches the description: {parse_description}.\n"
        "2. **Output Format:** Return the merged data ONLY as a single Markdown table. The table MUST be correctly formatted.\n"
        "3. **Markdown Table Format:** The table must adhere to the following Markdown format:\n"
        "   - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
        "   - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
        "   - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
        "   - Use pipes (|) to separate columns in each data row.\n"
        "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table.\n"
        "5. **Empty Response:** If no information matches the description, return an empty string ('') if no data can be merged.\n"
        "6. **Duplicate Columns:** If there are duplicate columns, rename them to be unique.\n"
        "7. **Missing Values:** If there are missing values, fill them with 'N/A'.\n\n"
    )
    merge_prompt = (
        instructions
        + "Here are the tables:\n\n"
        + "\n\n".join(table_strings)
        + "\n\nReturn the merged table in Markdown format:"
    )

    # Invoke the LLM
    message = HumanMessage(content=merge_prompt)
    response = model.invoke([message])
    return response.content