Spaces:
Sleeping
Sleeping
Commit
·
08fabf7
1
Parent(s):
f79ba24
feat: add docling support
Browse files- app.py +72 -19
- demo.py +8 -0
- requirements.in +4 -2
- requirements.txt +6 -438
app.py
CHANGED
|
@@ -2,12 +2,24 @@ import logging
|
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
import gradio as gr
|
|
|
|
| 5 |
from datasets import Dataset
|
| 6 |
from gradio_log import Log
|
| 7 |
from huggingface_hub import DatasetCard
|
| 8 |
-
from llama_index.core import SimpleDirectoryReader
|
| 9 |
from llama_index.core.node_parser import SentenceSplitter
|
|
|
|
| 10 |
from llama_index.core.schema import MetadataMode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from tqdm.auto import tqdm
|
| 12 |
|
| 13 |
log_file = "logs.txt"
|
|
@@ -22,8 +34,40 @@ def load_corpus(
|
|
| 22 |
):
|
| 23 |
if verbose:
|
| 24 |
gr.Info("Loading files...")
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
if split_sentences is False:
|
| 28 |
gr.Info(
|
| 29 |
"Skipping sentence splitting. Each file will be a single row in the dataset."
|
|
@@ -61,7 +105,10 @@ def upload_and_preview(
|
|
| 61 |
split_sentences: bool = True,
|
| 62 |
):
|
| 63 |
print("loading files")
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
print("parsing into sentences")
|
| 67 |
corpus = load_corpus(
|
|
@@ -159,17 +206,18 @@ def update_dataset_card(
|
|
| 159 |
description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
|
| 160 |
|
| 161 |
Key features:
|
| 162 |
-
-
|
| 163 |
- ✂️ Customizable text chunking
|
| 164 |
- 👁️ Instant dataset preview
|
| 165 |
-
- 🚀 One-click upload to Hugging Face
|
| 166 |
|
| 167 |
-
#### Powered by Llama Index
|
| 168 |
|
| 169 |
-
Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/)
|
| 170 |
|
|
|
|
| 171 |
|
| 172 |
-
Get started by uploading your files and see your corpus take shape!
|
| 173 |
|
| 174 |
[View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
|
| 175 |
"""
|
|
@@ -189,14 +237,19 @@ with gr.Blocks() as demo:
|
|
| 189 |
gr.Markdown(
|
| 190 |
"### 1. Upload Files\nClick 'Upload Files' to select text file(s). A preview will generate automatically"
|
| 191 |
)
|
| 192 |
-
with gr.
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
gr.Markdown("""
|
| 201 |
### 2. Adjust Parameters for Chunking Text (Optional)
|
| 202 |
Customize the chunk size, overlap, and sentence splitting option according to your requirements.
|
|
@@ -238,8 +291,8 @@ with gr.Blocks() as demo:
|
|
| 238 |
with gr.Accordion("detailed logs", open=False):
|
| 239 |
Log(log_file, dark=True, xterm_font_size=12)
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
|
| 244 |
outputs=[state, corpus_preview_df, preview_summary],
|
| 245 |
)
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
+
import pandas as pd
|
| 6 |
from datasets import Dataset
|
| 7 |
from gradio_log import Log
|
| 8 |
from huggingface_hub import DatasetCard
|
|
|
|
| 9 |
from llama_index.core.node_parser import SentenceSplitter
|
| 10 |
+
from llama_index.core.readers import SimpleDirectoryReader
|
| 11 |
from llama_index.core.schema import MetadataMode
|
| 12 |
+
from llama_index.readers.docling import DoclingReader
|
| 13 |
+
from llama_index.readers.file import (
|
| 14 |
+
EpubReader,
|
| 15 |
+
HWPReader,
|
| 16 |
+
ImageReader,
|
| 17 |
+
IPYNBReader,
|
| 18 |
+
MboxReader,
|
| 19 |
+
PandasCSVReader,
|
| 20 |
+
PandasExcelReader,
|
| 21 |
+
VideoAudioReader,
|
| 22 |
+
)
|
| 23 |
from tqdm.auto import tqdm
|
| 24 |
|
| 25 |
log_file = "logs.txt"
|
|
|
|
| 34 |
):
|
| 35 |
if verbose:
|
| 36 |
gr.Info("Loading files...")
|
| 37 |
+
|
| 38 |
+
docling_reader = DoclingReader()
|
| 39 |
+
try:
|
| 40 |
+
docs = []
|
| 41 |
+
for file in files:
|
| 42 |
+
docs.extend(docling_reader.load_data(file))
|
| 43 |
+
except Exception:
|
| 44 |
+
reader = SimpleDirectoryReader(
|
| 45 |
+
input_files=files,
|
| 46 |
+
file_extractor={
|
| 47 |
+
".hwp": HWPReader,
|
| 48 |
+
".pdf": docling_reader,
|
| 49 |
+
".docx": docling_reader,
|
| 50 |
+
".pptx": docling_reader,
|
| 51 |
+
".ppt": docling_reader,
|
| 52 |
+
".pptm": docling_reader,
|
| 53 |
+
".gif": ImageReader,
|
| 54 |
+
".jpg": ImageReader,
|
| 55 |
+
".png": ImageReader,
|
| 56 |
+
".jpeg": ImageReader,
|
| 57 |
+
".webp": ImageReader,
|
| 58 |
+
".mp3": VideoAudioReader,
|
| 59 |
+
".mp4": VideoAudioReader,
|
| 60 |
+
".csv": PandasCSVReader,
|
| 61 |
+
".epub": EpubReader,
|
| 62 |
+
".md": docling_reader,
|
| 63 |
+
".mbox": MboxReader,
|
| 64 |
+
".ipynb": IPYNBReader,
|
| 65 |
+
".xls": PandasExcelReader,
|
| 66 |
+
".xlsx": PandasExcelReader,
|
| 67 |
+
},
|
| 68 |
+
)
|
| 69 |
+
docs = reader.load_data()
|
| 70 |
+
|
| 71 |
if split_sentences is False:
|
| 72 |
gr.Info(
|
| 73 |
"Skipping sentence splitting. Each file will be a single row in the dataset."
|
|
|
|
| 105 |
split_sentences: bool = True,
|
| 106 |
):
|
| 107 |
print("loading files")
|
| 108 |
+
if isinstance(files, pd.DataFrame):
|
| 109 |
+
file_paths = files["urls"].tolist()
|
| 110 |
+
else:
|
| 111 |
+
file_paths = [file.name for file in files]
|
| 112 |
|
| 113 |
print("parsing into sentences")
|
| 114 |
corpus = load_corpus(
|
|
|
|
| 206 |
description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
|
| 207 |
|
| 208 |
Key features:
|
| 209 |
+
- 🗂️ Reads popular document formats (PDF, DOCX, PPTX, HTML, AsciiDoc, Markdown)
|
| 210 |
- ✂️ Customizable text chunking
|
| 211 |
- 👁️ Instant dataset preview
|
| 212 |
+
- 🚀 One-click upload to Hugging Face Hub
|
| 213 |
|
| 214 |
+
#### Powered by Llama Index and Docling
|
| 215 |
|
| 216 |
+
Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/).
|
| 217 |
|
| 218 |
+
Docling is a tool for converting documents to text. It supports a wide range of document formats, including PDF, DOCX, PPTX, Images, HTML, AsciiDoc, and Markdown. [Learn more about Docling](https://ds4sd.github.io/docling/).
|
| 219 |
|
| 220 |
+
Get started by uploading your files and see your corpus take shape!
|
| 221 |
|
| 222 |
[View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
|
| 223 |
"""
|
|
|
|
| 237 |
gr.Markdown(
|
| 238 |
"### 1. Upload Files\nClick 'Upload Files' to select text file(s). A preview will generate automatically"
|
| 239 |
)
|
| 240 |
+
with gr.Tab():
|
| 241 |
+
with gr.Row():
|
| 242 |
+
upload_button = gr.File(
|
| 243 |
+
file_types=["text"],
|
| 244 |
+
file_count="multiple",
|
| 245 |
+
height=50,
|
| 246 |
+
interactive=True,
|
| 247 |
+
label="Upload Files",
|
| 248 |
+
)
|
| 249 |
+
with gr.Tab():
|
| 250 |
+
with gr.Row():
|
| 251 |
+
urls = gr.Dataframe(label="URL", headers=["urls"], interactive=True)
|
| 252 |
+
upload_button_files = gr.Button("Upload URLs")
|
| 253 |
gr.Markdown("""
|
| 254 |
### 2. Adjust Parameters for Chunking Text (Optional)
|
| 255 |
Customize the chunk size, overlap, and sentence splitting option according to your requirements.
|
|
|
|
| 291 |
with gr.Accordion("detailed logs", open=False):
|
| 292 |
Log(log_file, dark=True, xterm_font_size=12)
|
| 293 |
|
| 294 |
+
gr.on(
|
| 295 |
+
triggers=[upload_button.upload, upload_button_files.click],
|
| 296 |
inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
|
| 297 |
outputs=[state, corpus_preview_df, preview_summary],
|
| 298 |
)
|
demo.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from docling.document_converter import DocumentConverter
|
| 2 |
+
|
| 3 |
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
| 4 |
+
converter = DocumentConverter()
|
| 5 |
+
result = converter.convert(source)
|
| 6 |
+
print(
|
| 7 |
+
result.document.export_to_markdown()
|
| 8 |
+
) # output: "### Docling Technical Report[...]"
|
requirements.in
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
datasets
|
| 2 |
-
gradio[oauth]
|
| 3 |
gradio_log
|
| 4 |
-
llama_index
|
|
|
|
|
|
|
|
|
| 1 |
datasets
|
| 2 |
+
gradio[oauth]<5
|
| 3 |
gradio_log
|
| 4 |
+
llama_index==0.11.22
|
| 5 |
+
docling
|
| 6 |
+
llama-index-readers-docling
|
requirements.txt
CHANGED
|
@@ -1,438 +1,6 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
# datasets
|
| 8 |
-
# fsspec
|
| 9 |
-
# llama-index-core
|
| 10 |
-
# llama-index-legacy
|
| 11 |
-
aiosignal==1.3.1
|
| 12 |
-
# via aiohttp
|
| 13 |
-
altair==5.3.0
|
| 14 |
-
# via gradio
|
| 15 |
-
annotated-types==0.7.0
|
| 16 |
-
# via pydantic
|
| 17 |
-
anyio==4.4.0
|
| 18 |
-
# via
|
| 19 |
-
# httpx
|
| 20 |
-
# openai
|
| 21 |
-
# starlette
|
| 22 |
-
# watchfiles
|
| 23 |
-
attrs==23.2.0
|
| 24 |
-
# via
|
| 25 |
-
# aiohttp
|
| 26 |
-
# jsonschema
|
| 27 |
-
# referencing
|
| 28 |
-
authlib==1.3.1
|
| 29 |
-
# via gradio
|
| 30 |
-
beautifulsoup4==4.12.3
|
| 31 |
-
# via llama-index-readers-file
|
| 32 |
-
certifi==2024.6.2
|
| 33 |
-
# via
|
| 34 |
-
# httpcore
|
| 35 |
-
# httpx
|
| 36 |
-
# requests
|
| 37 |
-
cffi==1.16.0
|
| 38 |
-
# via cryptography
|
| 39 |
-
charset-normalizer==3.3.2
|
| 40 |
-
# via requests
|
| 41 |
-
click==8.1.7
|
| 42 |
-
# via
|
| 43 |
-
# nltk
|
| 44 |
-
# typer
|
| 45 |
-
# uvicorn
|
| 46 |
-
contourpy==1.2.1
|
| 47 |
-
# via matplotlib
|
| 48 |
-
cryptography==42.0.8
|
| 49 |
-
# via authlib
|
| 50 |
-
cycler==0.12.1
|
| 51 |
-
# via matplotlib
|
| 52 |
-
dataclasses-json==0.6.7
|
| 53 |
-
# via
|
| 54 |
-
# llama-index-core
|
| 55 |
-
# llama-index-legacy
|
| 56 |
-
datasets==2.20.0
|
| 57 |
-
# via -r requirements.in
|
| 58 |
-
deprecated==1.2.14
|
| 59 |
-
# via
|
| 60 |
-
# llama-index-core
|
| 61 |
-
# llama-index-legacy
|
| 62 |
-
dill==0.3.8
|
| 63 |
-
# via
|
| 64 |
-
# datasets
|
| 65 |
-
# multiprocess
|
| 66 |
-
dirtyjson==1.0.8
|
| 67 |
-
# via
|
| 68 |
-
# llama-index-core
|
| 69 |
-
# llama-index-legacy
|
| 70 |
-
distro==1.9.0
|
| 71 |
-
# via openai
|
| 72 |
-
dnspython==2.6.1
|
| 73 |
-
# via email-validator
|
| 74 |
-
email-validator==2.1.2
|
| 75 |
-
# via fastapi
|
| 76 |
-
fastapi==0.111.0
|
| 77 |
-
# via gradio
|
| 78 |
-
fastapi-cli==0.0.4
|
| 79 |
-
# via fastapi
|
| 80 |
-
ffmpy==0.3.2
|
| 81 |
-
# via gradio
|
| 82 |
-
filelock==3.15.1
|
| 83 |
-
# via
|
| 84 |
-
# datasets
|
| 85 |
-
# huggingface-hub
|
| 86 |
-
fonttools==4.53.0
|
| 87 |
-
# via matplotlib
|
| 88 |
-
frozenlist==1.4.1
|
| 89 |
-
# via
|
| 90 |
-
# aiohttp
|
| 91 |
-
# aiosignal
|
| 92 |
-
fsspec==2024.5.0
|
| 93 |
-
# via
|
| 94 |
-
# datasets
|
| 95 |
-
# gradio-client
|
| 96 |
-
# huggingface-hub
|
| 97 |
-
# llama-index-core
|
| 98 |
-
# llama-index-legacy
|
| 99 |
-
gradio==4.36.1
|
| 100 |
-
# via
|
| 101 |
-
# -r requirements.in
|
| 102 |
-
# gradio-log
|
| 103 |
-
gradio-client==1.0.1
|
| 104 |
-
# via gradio
|
| 105 |
-
gradio-log==0.0.4
|
| 106 |
-
# via -r requirements.in
|
| 107 |
-
greenlet==3.0.3
|
| 108 |
-
# via sqlalchemy
|
| 109 |
-
h11==0.14.0
|
| 110 |
-
# via
|
| 111 |
-
# httpcore
|
| 112 |
-
# uvicorn
|
| 113 |
-
httpcore==1.0.5
|
| 114 |
-
# via httpx
|
| 115 |
-
httptools==0.6.1
|
| 116 |
-
# via uvicorn
|
| 117 |
-
httpx==0.27.0
|
| 118 |
-
# via
|
| 119 |
-
# fastapi
|
| 120 |
-
# gradio
|
| 121 |
-
# gradio-client
|
| 122 |
-
# llama-index-core
|
| 123 |
-
# llama-index-legacy
|
| 124 |
-
# llamaindex-py-client
|
| 125 |
-
# openai
|
| 126 |
-
huggingface-hub==0.23.4
|
| 127 |
-
# via
|
| 128 |
-
# datasets
|
| 129 |
-
# gradio
|
| 130 |
-
# gradio-client
|
| 131 |
-
idna==3.7
|
| 132 |
-
# via
|
| 133 |
-
# anyio
|
| 134 |
-
# email-validator
|
| 135 |
-
# httpx
|
| 136 |
-
# requests
|
| 137 |
-
# yarl
|
| 138 |
-
importlib-resources==6.4.0
|
| 139 |
-
# via gradio
|
| 140 |
-
itsdangerous==2.2.0
|
| 141 |
-
# via gradio
|
| 142 |
-
jinja2==3.1.4
|
| 143 |
-
# via
|
| 144 |
-
# altair
|
| 145 |
-
# fastapi
|
| 146 |
-
# gradio
|
| 147 |
-
joblib==1.4.2
|
| 148 |
-
# via nltk
|
| 149 |
-
jsonschema==4.22.0
|
| 150 |
-
# via altair
|
| 151 |
-
jsonschema-specifications==2023.12.1
|
| 152 |
-
# via jsonschema
|
| 153 |
-
kiwisolver==1.4.5
|
| 154 |
-
# via matplotlib
|
| 155 |
-
llama-index==0.10.45
|
| 156 |
-
# via -r requirements.in
|
| 157 |
-
llama-index-agent-openai==0.2.7
|
| 158 |
-
# via
|
| 159 |
-
# llama-index
|
| 160 |
-
# llama-index-program-openai
|
| 161 |
-
llama-index-cli==0.1.12
|
| 162 |
-
# via llama-index
|
| 163 |
-
llama-index-core==0.10.44
|
| 164 |
-
# via
|
| 165 |
-
# llama-index
|
| 166 |
-
# llama-index-agent-openai
|
| 167 |
-
# llama-index-cli
|
| 168 |
-
# llama-index-embeddings-openai
|
| 169 |
-
# llama-index-indices-managed-llama-cloud
|
| 170 |
-
# llama-index-llms-openai
|
| 171 |
-
# llama-index-multi-modal-llms-openai
|
| 172 |
-
# llama-index-program-openai
|
| 173 |
-
# llama-index-question-gen-openai
|
| 174 |
-
# llama-index-readers-file
|
| 175 |
-
# llama-index-readers-llama-parse
|
| 176 |
-
# llama-parse
|
| 177 |
-
llama-index-embeddings-openai==0.1.10
|
| 178 |
-
# via
|
| 179 |
-
# llama-index
|
| 180 |
-
# llama-index-cli
|
| 181 |
-
llama-index-indices-managed-llama-cloud==0.1.6
|
| 182 |
-
# via llama-index
|
| 183 |
-
llama-index-legacy==0.9.48
|
| 184 |
-
# via llama-index
|
| 185 |
-
llama-index-llms-openai==0.1.22
|
| 186 |
-
# via
|
| 187 |
-
# llama-index
|
| 188 |
-
# llama-index-agent-openai
|
| 189 |
-
# llama-index-cli
|
| 190 |
-
# llama-index-multi-modal-llms-openai
|
| 191 |
-
# llama-index-program-openai
|
| 192 |
-
# llama-index-question-gen-openai
|
| 193 |
-
llama-index-multi-modal-llms-openai==0.1.6
|
| 194 |
-
# via llama-index
|
| 195 |
-
llama-index-program-openai==0.1.6
|
| 196 |
-
# via
|
| 197 |
-
# llama-index
|
| 198 |
-
# llama-index-question-gen-openai
|
| 199 |
-
llama-index-question-gen-openai==0.1.3
|
| 200 |
-
# via llama-index
|
| 201 |
-
llama-index-readers-file==0.1.25
|
| 202 |
-
# via llama-index
|
| 203 |
-
llama-index-readers-llama-parse==0.1.4
|
| 204 |
-
# via llama-index
|
| 205 |
-
llama-parse==0.4.4
|
| 206 |
-
# via llama-index-readers-llama-parse
|
| 207 |
-
llamaindex-py-client==0.1.19
|
| 208 |
-
# via
|
| 209 |
-
# llama-index-core
|
| 210 |
-
# llama-index-indices-managed-llama-cloud
|
| 211 |
-
markdown-it-py==3.0.0
|
| 212 |
-
# via rich
|
| 213 |
-
markupsafe==2.1.5
|
| 214 |
-
# via
|
| 215 |
-
# gradio
|
| 216 |
-
# jinja2
|
| 217 |
-
marshmallow==3.21.3
|
| 218 |
-
# via dataclasses-json
|
| 219 |
-
matplotlib==3.9.0
|
| 220 |
-
# via gradio
|
| 221 |
-
mdurl==0.1.2
|
| 222 |
-
# via markdown-it-py
|
| 223 |
-
multidict==6.0.5
|
| 224 |
-
# via
|
| 225 |
-
# aiohttp
|
| 226 |
-
# yarl
|
| 227 |
-
multiprocess==0.70.16
|
| 228 |
-
# via datasets
|
| 229 |
-
mypy-extensions==1.0.0
|
| 230 |
-
# via typing-inspect
|
| 231 |
-
nest-asyncio==1.6.0
|
| 232 |
-
# via
|
| 233 |
-
# llama-index-core
|
| 234 |
-
# llama-index-legacy
|
| 235 |
-
networkx==3.3
|
| 236 |
-
# via
|
| 237 |
-
# llama-index-core
|
| 238 |
-
# llama-index-legacy
|
| 239 |
-
nltk==3.8.1
|
| 240 |
-
# via
|
| 241 |
-
# llama-index-core
|
| 242 |
-
# llama-index-legacy
|
| 243 |
-
numpy==2.0.0
|
| 244 |
-
# via
|
| 245 |
-
# altair
|
| 246 |
-
# contourpy
|
| 247 |
-
# datasets
|
| 248 |
-
# gradio
|
| 249 |
-
# llama-index-core
|
| 250 |
-
# llama-index-legacy
|
| 251 |
-
# matplotlib
|
| 252 |
-
# pandas
|
| 253 |
-
# pyarrow
|
| 254 |
-
openai==1.34.0
|
| 255 |
-
# via
|
| 256 |
-
# llama-index-agent-openai
|
| 257 |
-
# llama-index-core
|
| 258 |
-
# llama-index-legacy
|
| 259 |
-
orjson==3.10.5
|
| 260 |
-
# via
|
| 261 |
-
# fastapi
|
| 262 |
-
# gradio
|
| 263 |
-
packaging==24.1
|
| 264 |
-
# via
|
| 265 |
-
# altair
|
| 266 |
-
# datasets
|
| 267 |
-
# gradio
|
| 268 |
-
# gradio-client
|
| 269 |
-
# huggingface-hub
|
| 270 |
-
# marshmallow
|
| 271 |
-
# matplotlib
|
| 272 |
-
pandas==2.2.2
|
| 273 |
-
# via
|
| 274 |
-
# altair
|
| 275 |
-
# datasets
|
| 276 |
-
# gradio
|
| 277 |
-
# llama-index-core
|
| 278 |
-
# llama-index-legacy
|
| 279 |
-
pillow==10.3.0
|
| 280 |
-
# via
|
| 281 |
-
# gradio
|
| 282 |
-
# llama-index-core
|
| 283 |
-
# matplotlib
|
| 284 |
-
pyarrow==16.1.0
|
| 285 |
-
# via datasets
|
| 286 |
-
pyarrow-hotfix==0.6
|
| 287 |
-
# via datasets
|
| 288 |
-
pycparser==2.22
|
| 289 |
-
# via cffi
|
| 290 |
-
pydantic==2.7.4
|
| 291 |
-
# via
|
| 292 |
-
# fastapi
|
| 293 |
-
# gradio
|
| 294 |
-
# llamaindex-py-client
|
| 295 |
-
# openai
|
| 296 |
-
pydantic-core==2.18.4
|
| 297 |
-
# via pydantic
|
| 298 |
-
pydub==0.25.1
|
| 299 |
-
# via gradio
|
| 300 |
-
pygments==2.18.0
|
| 301 |
-
# via rich
|
| 302 |
-
pyparsing==3.1.2
|
| 303 |
-
# via matplotlib
|
| 304 |
-
pypdf==4.2.0
|
| 305 |
-
# via llama-index-readers-file
|
| 306 |
-
python-dateutil==2.9.0.post0
|
| 307 |
-
# via
|
| 308 |
-
# matplotlib
|
| 309 |
-
# pandas
|
| 310 |
-
python-dotenv==1.0.1
|
| 311 |
-
# via uvicorn
|
| 312 |
-
python-multipart==0.0.9
|
| 313 |
-
# via
|
| 314 |
-
# fastapi
|
| 315 |
-
# gradio
|
| 316 |
-
pytz==2024.1
|
| 317 |
-
# via pandas
|
| 318 |
-
pyyaml==6.0.1
|
| 319 |
-
# via
|
| 320 |
-
# datasets
|
| 321 |
-
# gradio
|
| 322 |
-
# huggingface-hub
|
| 323 |
-
# llama-index-core
|
| 324 |
-
# uvicorn
|
| 325 |
-
referencing==0.35.1
|
| 326 |
-
# via
|
| 327 |
-
# jsonschema
|
| 328 |
-
# jsonschema-specifications
|
| 329 |
-
regex==2024.5.15
|
| 330 |
-
# via
|
| 331 |
-
# nltk
|
| 332 |
-
# tiktoken
|
| 333 |
-
requests==2.32.3
|
| 334 |
-
# via
|
| 335 |
-
# datasets
|
| 336 |
-
# huggingface-hub
|
| 337 |
-
# llama-index-core
|
| 338 |
-
# llama-index-legacy
|
| 339 |
-
# tiktoken
|
| 340 |
-
rich==13.7.1
|
| 341 |
-
# via typer
|
| 342 |
-
rpds-py==0.18.1
|
| 343 |
-
# via
|
| 344 |
-
# jsonschema
|
| 345 |
-
# referencing
|
| 346 |
-
ruff==0.4.9
|
| 347 |
-
# via gradio
|
| 348 |
-
semantic-version==2.10.0
|
| 349 |
-
# via gradio
|
| 350 |
-
shellingham==1.5.4
|
| 351 |
-
# via typer
|
| 352 |
-
six==1.16.0
|
| 353 |
-
# via python-dateutil
|
| 354 |
-
sniffio==1.3.1
|
| 355 |
-
# via
|
| 356 |
-
# anyio
|
| 357 |
-
# httpx
|
| 358 |
-
# openai
|
| 359 |
-
soupsieve==2.5
|
| 360 |
-
# via beautifulsoup4
|
| 361 |
-
sqlalchemy==2.0.30
|
| 362 |
-
# via
|
| 363 |
-
# llama-index-core
|
| 364 |
-
# llama-index-legacy
|
| 365 |
-
starlette==0.37.2
|
| 366 |
-
# via fastapi
|
| 367 |
-
striprtf==0.0.26
|
| 368 |
-
# via llama-index-readers-file
|
| 369 |
-
tenacity==8.4.1
|
| 370 |
-
# via
|
| 371 |
-
# llama-index-core
|
| 372 |
-
# llama-index-legacy
|
| 373 |
-
tiktoken==0.7.0
|
| 374 |
-
# via
|
| 375 |
-
# llama-index-core
|
| 376 |
-
# llama-index-legacy
|
| 377 |
-
tomlkit==0.12.0
|
| 378 |
-
# via gradio
|
| 379 |
-
toolz==0.12.1
|
| 380 |
-
# via altair
|
| 381 |
-
tqdm==4.66.4
|
| 382 |
-
# via
|
| 383 |
-
# datasets
|
| 384 |
-
# huggingface-hub
|
| 385 |
-
# llama-index-core
|
| 386 |
-
# nltk
|
| 387 |
-
# openai
|
| 388 |
-
typer==0.12.3
|
| 389 |
-
# via
|
| 390 |
-
# fastapi-cli
|
| 391 |
-
# gradio
|
| 392 |
-
typing-extensions==4.12.2
|
| 393 |
-
# via
|
| 394 |
-
# fastapi
|
| 395 |
-
# gradio
|
| 396 |
-
# gradio-client
|
| 397 |
-
# huggingface-hub
|
| 398 |
-
# llama-index-core
|
| 399 |
-
# llama-index-legacy
|
| 400 |
-
# openai
|
| 401 |
-
# pydantic
|
| 402 |
-
# pydantic-core
|
| 403 |
-
# sqlalchemy
|
| 404 |
-
# typer
|
| 405 |
-
# typing-inspect
|
| 406 |
-
typing-inspect==0.9.0
|
| 407 |
-
# via
|
| 408 |
-
# dataclasses-json
|
| 409 |
-
# llama-index-core
|
| 410 |
-
# llama-index-legacy
|
| 411 |
-
tzdata==2024.1
|
| 412 |
-
# via pandas
|
| 413 |
-
ujson==5.10.0
|
| 414 |
-
# via fastapi
|
| 415 |
-
urllib3==2.2.2
|
| 416 |
-
# via
|
| 417 |
-
# gradio
|
| 418 |
-
# requests
|
| 419 |
-
uvicorn==0.30.1
|
| 420 |
-
# via
|
| 421 |
-
# fastapi
|
| 422 |
-
# gradio
|
| 423 |
-
uvloop==0.19.0
|
| 424 |
-
# via uvicorn
|
| 425 |
-
watchfiles==0.22.0
|
| 426 |
-
# via uvicorn
|
| 427 |
-
websockets==11.0.3
|
| 428 |
-
# via
|
| 429 |
-
# gradio-client
|
| 430 |
-
# uvicorn
|
| 431 |
-
wrapt==1.16.0
|
| 432 |
-
# via
|
| 433 |
-
# deprecated
|
| 434 |
-
# llama-index-core
|
| 435 |
-
xxhash==3.4.1
|
| 436 |
-
# via datasets
|
| 437 |
-
yarl==1.9.4
|
| 438 |
-
# via aiohttp
|
|
|
|
| 1 |
+
datasets
|
| 2 |
+
gradio[oauth]<5
|
| 3 |
+
gradio_log
|
| 4 |
+
llama_index==0.11.22
|
| 5 |
+
docling
|
| 6 |
+
llama-index-readers-docling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|