Spaces:

oidlabs
/

Lexoid

Running

App Files Files Community

dilithjay commited on Apr 20, 2025

Commit

229b460

1 Parent(s): e13ec00

Initial commit

Browse files

Files changed (6) hide show

.gitignore +1 -0
README.md +6 -5
app.py +171 -0
leaderboard.csv +17 -0
packages.txt +12 -0
requirements.txt +2 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .venv

README.md CHANGED Viewed

@@ -1,13 +1,14 @@
 ---
 title: Lexoid
-emoji: 🏢
-colorFrom: purple
-colorTo: yellow
 sdk: gradio
-sdk_version: 5.25.2
 app_file: app.py
 pinned: false
-short_description: An efficient document parsing library
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Lexoid
+emoji: 📄
+colorFrom: yellow
+colorTo: red
 sdk: gradio
+sdk_version: 5.25.1
 app_file: app.py
 pinned: false
+license: apache-2.0
+short_description: Try out Lexoid, an efficient document parsing library.
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import os
+import gradio as gr
+import pandas as pd
+from lexoid.api import parse
+# Parser modes offered as choices in the "Parser Type" dropdown of the UI.
+parser_options = ["LLM_PARSE", "STATIC_PARSE", "AUTO"]
+# Function to set the environment variables and parse the document
+def run_parser(
+    file,
+    parser_type,
+    model,
+    pages_per_split,
+    max_processes,
+    as_pdf,
+    x_tolerance,
+    y_tolerance,
+    save_dir,
+    page_nums,
+    router_priority,
+    framework,
+    temperature,
+    depth,
+    google_api_key,
+    openai_api_key,
+    huggingfacehub_api_token,
+    together_api_key,
+    openrouter_api_key,
+):
+    # Set environment variables
+    os.environ["GOOGLE_API_KEY"] = google_api_key
+    os.environ["OPENAI_API_KEY"] = openai_api_key
+    os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingfacehub_api_token
+    os.environ["TOGETHER_API_KEY"] = together_api_key
+    os.environ["OPENROUTER_API_KEY"] = openrouter_api_key
+    if file is None:
+        return "Please upload a file to parse."
+    kwargs = {
+        "model": model,
+        "pages_per_split": pages_per_split,
+        "max_processes": max_processes,
+        "as_pdf": as_pdf,
+        "x_tolerance": x_tolerance,
+        "y_tolerance": y_tolerance,
+        "save_dir": save_dir,
+        "page_nums": (
+            [int(num.strip()) for num in page_nums.split(",")] if page_nums else None
+        ),
+        "router_priority": router_priority,
+        "framework": framework,
+        "temperature": temperature,
+        "depth": depth,
+    }
+    # Clean None values
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    result = parse(path=file.name, parser_type=parser_type, **kwargs)
+    if "raw" in result:
+        return result["raw"]
+    elif "segments" in result:
+        return "\n\n".join([seg.get("content", "") for seg in result["segments"]])
+    else:
+        return str(result)
+with gr.Blocks(title="Lexoid Document Parser") as app:
+    gr.Markdown(
+        "## 📄 Lexoid Document Parser\nUpload a document and customize how you'd like to parse it."
+    )
+    with gr.Row():
+        file_input = gr.File(
+            label="Upload Document",
+            file_types=[".pdf", ".docx", ".html", ".txt"],
+            type="filepath",
+        )
+        parser_type = gr.Dropdown(
+            choices=parser_options, value="AUTO", label="Parser Type"
+        )
+        model_input = gr.Textbox(value="gemini-2.0-flash", label="LLM Model")
+        framework = gr.Textbox(
+            value="pdfplumber",
+            label="Static Framework",
+            placeholder="e.g., pdfplumber, slate",
+        )
+    with gr.Accordion("Advanced Options", open=False):
+        pages_per_split = gr.Slider(
+            minimum=1, maximum=20, value=4, step=1, label="Pages per Split"
+        )
+        max_processes = gr.Slider(
+            minimum=1, maximum=16, value=4, step=1, label="Max Parallel Processes"
+        )
+        as_pdf = gr.Checkbox(label="Convert to PDF before parsing")
+        x_tolerance = gr.Number(label="X-axis Tolerance", value=None)
+        y_tolerance = gr.Number(label="Y-axis Tolerance", value=None)
+        save_dir = gr.Textbox(
+            label="Save Directory",
+            placeholder="Path to save intermediate files (optional)",
+        )
+        page_nums = gr.Textbox(
+            label="Page Numbers",
+            placeholder="Comma-separated page numbers (e.g., 1,3,5)",
+        )
+        router_priority = gr.Dropdown(
+            choices=["speed", "accuracy"], value="accuracy", label="Router Priority"
+        )
+        temperature = gr.Number(label="LLM Temperature", value=None)
+        depth = gr.Number(label="Recursive Depth", value=None)
+    # Adding the text boxes for the environment variables
+    with gr.Row():
+        google_api_key = gr.Textbox(
+            label="Google API Key", placeholder="Enter Google API Key"
+        )
+        openai_api_key = gr.Textbox(
+            label="OpenAI API Key", placeholder="Enter OpenAI API Key"
+        )
+        huggingfacehub_api_token = gr.Textbox(
+            label="HuggingFaceHub API Token",
+            placeholder="Enter HuggingFaceHub API Token",
+        )
+        together_api_key = gr.Textbox(
+            label="Together API Key", placeholder="Enter Together API Key"
+        )
+        openrouter_api_key = gr.Textbox(
+            label="OpenRouter API Key", placeholder="Enter OpenRouter API Key"
+        )
+    output = gr.Markdown(label="Parsed Output")
+    parse_button = gr.Button("Parse Document")
+    parse_button.click(
+        fn=run_parser,
+        inputs=[
+            file_input,
+            parser_type,
+            model_input,
+            pages_per_split,
+            max_processes,
+            as_pdf,
+            x_tolerance,
+            y_tolerance,
+            save_dir,
+            page_nums,
+            router_priority,
+            framework,
+            temperature,
+            depth,
+            google_api_key,
+            openai_api_key,
+            huggingfacehub_api_token,
+            together_api_key,
+            openrouter_api_key,
+        ],
+        outputs=output,
+    )
+    # Leaderboard loaded from leaderboard.csv
+    df = pd.read_csv("leaderboard.csv")
+    leaderboard = gr.Dataframe(
+        value=df,
+        label="Leaderboard",
+    )
+app.launch()

leaderboard.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+Rank,Model,Mean Similarity,Std. Dev.,Time (s),Cost($)
+1,gemini-2.0-flash,0.829,0.102,7.41,0.000480
+2,gemini-2.0-flash-001,0.814,0.176,6.85,0.000421
+3,gemini-1.5-flash,0.797,0.143,9.54,0.000238
+4,gemini-2.0-pro-exp,0.764,0.227,11.95,TBA
+5,gemini-2.0-flash-thinking-exp,0.746,0.266,10.46,TBA
+6,gemini-1.5-pro,0.732,0.265,11.44,0.003332
+7,gpt-4o,0.687,0.247,10.16,0.004736
+8,gpt-4o-mini,0.642,0.213,9.71,0.000275
+9,gemma-3-27b-it (via OpenRouter),0.628,0.299,18.79,0.000096
+10,gemini-1.5-flash-8b,0.551,0.223,3.91,0.000055
+11,Llama-Vision-Free (via Together AI),0.531,0.198,6.93,0
+12,Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI),0.524,0.192,3.68,0.000060
+13,qwen/qwen-2.5-vl-7b-instruct (via OpenRouter),0.482,0.209,11.53,0.000052
+14,Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI),0.461,0.306,19.26,0.000426
+15,Llama-3.2-11B-Vision-Instruct (via Hugging Face),0.451,0.257,4.54,0
+16,microsoft/phi-4-multimodal-instruct (via OpenRouter),0.366,0.287,10.80,0.000019

packages.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+libnss3-dev
+libxcomposite1
+libxcursor1
+libxdamage1
+libxi6
+libxtst6
+libnss3
+libxrandr2
+libasound2
+libpangocairo-1.0-0
+libatk1.0-0
+libgtk-3-0

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ lexoid==0.1.12
2	+ matplotlib==3.10.1