Spaces:
Runtime error
Runtime error
Upload 3 files
Browse files- app.css +182 -0
- app.py +335 -0
- requirements.txt +2 -0
app.css
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Syntax-highlight theme for Python-Markdown's CodeHilite output:
   https://python-markdown.github.io/extensions/code_hilite/
   Token classes that share identical declarations are grouped together. */
.codehilite { background: #f8f8f8; }
.codehilite .hll { background-color: #ffffcc }

/* Comment, Comment.Hashbang, .Multiline, .PreprocFile, .Single, .Special */
.codehilite .c,
.codehilite .ch,
.codehilite .cm,
.codehilite .cpf,
.codehilite .c1,
.codehilite .cs { color: #408080; font-style: italic }
.codehilite .cp { color: #BC7A00 } /* Comment.Preproc */

.codehilite .err { border: 1px solid #FF0000 } /* Error */

/* Keyword, Keyword.Constant, .Declaration, .Namespace, .Reserved, Name.Tag */
.codehilite .k,
.codehilite .kc,
.codehilite .kd,
.codehilite .kn,
.codehilite .kr,
.codehilite .nt { color: #008000; font-weight: bold }
/* Keyword.Pseudo, Name.Builtin, Literal.String.Other, Name.Builtin.Pseudo */
.codehilite .kp,
.codehilite .nb,
.codehilite .sx,
.codehilite .bp { color: #008000 }
.codehilite .kt { color: #B00040 } /* Keyword.Type */

/* Operator, Literal.Number and its Bin/Float/Hex/Integer/Oct/Integer.Long variants */
.codehilite .o,
.codehilite .m,
.codehilite .mb,
.codehilite .mf,
.codehilite .mh,
.codehilite .mi,
.codehilite .mo,
.codehilite .il { color: #666666 }
.codehilite .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */

/* Generic tokens */
.codehilite .gd { color: #A00000 } /* Generic.Deleted */
.codehilite .ge { font-style: italic } /* Generic.Emph */
.codehilite .gr { color: #FF0000 } /* Generic.Error */
.codehilite .gh,
.codehilite .gp { color: #000080; font-weight: bold } /* Generic.Heading, Generic.Prompt */
.codehilite .gi { color: #00A000 } /* Generic.Inserted */
.codehilite .go { color: #888888 } /* Generic.Output */
.codehilite .gs { font-weight: bold } /* Generic.Strong */
.codehilite .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
.codehilite .gt { color: #0044DD } /* Generic.Traceback */

/* Literal.String and its Affix/Backtick/Char/Delimiter/Double/Heredoc/Single variants */
.codehilite .s,
.codehilite .sa,
.codehilite .sb,
.codehilite .sc,
.codehilite .dl,
.codehilite .s2,
.codehilite .sh,
.codehilite .s1 { color: #BA2121 }
.codehilite .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
.codehilite .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
.codehilite .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
.codehilite .sr { color: #BB6688 } /* Literal.String.Regex */

/* Names */
.codehilite .na { color: #7D9029 } /* Name.Attribute */
.codehilite .nc,
.codehilite .nn { color: #0000FF; font-weight: bold } /* Name.Class, Name.Namespace */
.codehilite .no { color: #880000 } /* Name.Constant */
.codehilite .nd { color: #AA22FF } /* Name.Decorator */
.codehilite .ni { color: #999999; font-weight: bold } /* Name.Entity */
.codehilite .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
.codehilite .nf,
.codehilite .fm { color: #0000FF } /* Name.Function, Name.Function.Magic */
.codehilite .nl { color: #A0A000 } /* Name.Label */
/* Name.Variable (+ Class/Global/Instance/Magic) and Literal.String.Symbol */
.codehilite .nv,
.codehilite .ss,
.codehilite .vc,
.codehilite .vg,
.codehilite .vi,
.codehilite .vm { color: #19177C }

.codehilite .w { color: #bbbbbb } /* Text.Whitespace */
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
/* Cover card: a vertically stacked, centred flex container. */
.project_cover {
  /* layout */
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  min-height: 650px;
  padding: 40px; /* generous inner spacing */

  /* appearance */
  background-color: #ffffff;
  border: 1px solid rgba(229, 231, 235, 0.6); /* slightly transparent border */
  border-radius: 16px; /* rounded corners */
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); /* subtle drop shadow */
}
|
| 85 |
+
|
| 86 |
+
/* Wrapper around the cover image: clips overflow and centres the image
   with flexbox. (`position` has no `center` value, so the former
   `position: center` declaration was invalid and has been removed.) */
.project_img {
  overflow: hidden;
  display: flex;
  justify-content: center;
  align-items: center;
  margin-bottom: auto;
  /* box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15), 0 6px 20px rgba(0, 0, 0, 0.5); */
}

/* The cover image itself takes 80% of its wrapper. */
.project_img img {
  width: 80%;
  height: 80%;
}

/* Class attached to the demo's image components (elem_classes="show_image"). */
.show_image {
  justify-content: center;
  align-items: center;
}

/* Images inside those components are shown at half size. */
.show_image img {
  width: 50%;
  height: 50%;
}
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
/* Section label: bold dark-grey text with a divider line underneath. */
.project_label {
  margin-bottom: 20px; /* space below the label */
  padding: 10px;
  border-bottom: 2px solid #ddd; /* divider line */
  color: #333; /* dark grey */
  font-size: 18px;
  font-weight: bold;
  text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1);
  transition: all 0.3s ease; /* smooth transition for the hover effect */
}

/* Project title: large, centred, with widened letter spacing. */
.project_name {
  margin-top: 20px; /* space above the title */
  color: #333333; /* darker text for contrast */
  font-size: 30px;
  letter-spacing: 1.5px;
  text-align: center;
  align-items: center;
  justify-content: center;
  transition: all 0.3s ease; /* smooth transition for the hover effect */
  /* Currently disabled: font-weight: bold; text-transform: uppercase; */
}

/* Descriptive paragraphs shown under the title. */
.project_desc {
  margin: 20px 0; /* vertical spacing only */
  color: #444444;
  font-size: 18px;
  line-height: 1.5; /* taller lines for readability */
  text-align: center;
  transition: all 0.3s ease; /* smooth transition for the hover effect */
}
|
| 144 |
+
|
| 145 |
+
/* Messages keep their line breaks but still wrap. */
.markdown-body .message {
  white-space: pre-wrap;
}

/* Collapsible <details> blocks stay on a single line. */
.markdown-body details {
  white-space: nowrap;
}

/* Thin gap between consecutive <details> blocks under .bot. */
.markdown-body .bot details:not(:last-child) {
  margin-bottom: 1px;
}

/* <summary> rendered as a small dark pill. */
.markdown-body summary {
  padding: 0 4px;
  border-radius: 4px;
  font-size: 0.9em;
  color: #eee;
  background-color: #4b5563;
}
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
/* Introduction block: content centred with CSS grid.
   The original rule had a dangling `width:` with no value or semicolon,
   which made the parser discard the following `font-size` declaration.
   It has been removed so the 15px body font size takes effect; the width
   stays at its default. */
.project_intro {
  display: grid;
  place-items: center; /* centre content on both axes */
  height: 100px;
  font-size: 15px; /* body text size */
  /* text-align: center; */
  color: #555; /* lighter grey for body text */
  border-radius: 8px; /* rounded corners */
  transition: transform 0.3s ease; /* smooth transition for the hover effect */
}

/* Hover animation: lift the element slightly. */
.project_desc:hover,
.project_name:hover,
.project_label:hover,
.project_intro:hover {
  transform: translateY(-5px); /* move up by 5px */
}
|
app.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import inspect
|
| 3 |
+
import base64
|
| 4 |
+
import yaml
|
| 5 |
+
import copy
|
| 6 |
+
import shutil
|
| 7 |
+
import gradio as gr
|
| 8 |
+
from data_juicer.ops.base_op import OPERATORS
|
| 9 |
+
from data_juicer.utils.constant import Fields
|
| 10 |
+
# Absolute directory that contains this script.
demo_path = os.path.dirname(os.path.abspath(__file__))
# Two directory levels above the demo directory; the code below expects it
# to contain the `docs/` folder.
project_path = os.path.abspath(os.path.join(demo_path, os.pardir, os.pardir))
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Convert a local image file into a base64 data URL.
def covert_image_to_base64(image_path):
    """Return the image at ``image_path`` encoded as a ``data:`` URL.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A string of the form ``data:image/<ext>;base64,<payload>``. ``<ext>``
        is ``gif``, ``jpeg`` or ``png``; any other (or missing) extension is
        reported as ``jpeg``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Extension without the dot, lower-cased so "PNG" and "png" are treated
    # alike (the MIME subtype must not depend on the file name's case).
    ext = os.path.splitext(image_path)[1].lstrip(".").lower()
    if ext not in ("gif", "jpeg", "png"):
        ext = "jpeg"

    with open(image_path, "rb") as image_file:
        # b64encode returns bytes; decode to str for embedding in the URL.
        base64_data = base64.b64encode(image_file.read()).decode("utf-8")

    return f"data:image/{ext};base64,{base64_data}"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def format_cover_html(project_img_path):
    """Build the HTML banner shown at the top of the demo page.

    Args:
        project_img_path: Path of the project cover image. Currently unused:
            the image markup is disabled, so only text sections are rendered.

    Returns:
        An HTML string with the project name, description, introduction,
        a demo hint and a note that links to the GitHub repository.
    """
    readme_link = 'https://github.com/alibaba/data-juicer'
    config = {
        'name': "Data-Juicer",
        'label': "Op Insight",
        'description': 'A One-Stop Data Processing System for Large Language Models.',
        'introduction':
        "This project is being actively updated and maintained, and we will periodically enhance and add more features and data recipes. <br>"
        "We welcome you to join us in promoting LLM data development and research!<br>",
        'demo': "You can experience the effect of the operators of Data-Juicer",
        # Must be an f-string: otherwise the href is the literal text
        # "{readme_link}" instead of the repository URL.
        'note': f'Note: Due to resource limitations, only a subset of operations is available here. see more details in <a href="{readme_link}">GitHub</a>',
    }
    # Disabled cover-image markup, kept for reference:
    #   image_src = covert_image_to_base64(project_img_path)
    #   <div class="project_img"> <img src={image_src} /> </div>
    #   <div class='project_cover'>
    return f"""
<div>
<div class="project_name">{config.get('name', '')} </div>
<div class="project_desc">{config.get('description', '')}</div>
<div class="project_desc">{config.get('introduction', '')}</div>
<div class="project_desc">{config.get('demo', '')}</div>
<div class="project_desc">{config.get('note', '')}</div>
</div>
"""
|
| 57 |
+
# Raw markdown of the operator documentation. Stays empty when the file is
# not present.
op_text = ''
docs_file = os.path.join(project_path, 'docs/Operators.md')
if os.path.exists(docs_file):
    # Explicit encoding so reading does not depend on the platform default.
    with open(docs_file, 'r', encoding='utf-8') as f:
        op_text = f.read()
|
| 62 |
+
|
| 63 |
+
def extract_op_desc(markdown_text, header):
    """Return the text of the markdown section that starts with ``header``.

    The section runs from just after ``header`` up to the next line that
    begins with ``##`` (deeper headings such as ``###`` also end it), or to
    the end of the text when no such line follows.

    Args:
        markdown_text: Full markdown document.
        header: Exact heading text to look for, e.g. ``'## Overview'``.

    Returns:
        The stripped section body, or ``''`` when ``header`` is not found.
    """
    start_index = markdown_text.find(header)
    if start_index == -1:
        # Without this guard the slice below would start at a bogus offset.
        return ''
    body_start = start_index + len(header)
    end_index = markdown_text.find("\n##", body_start)
    if end_index == -1:
        # Last section: a raw -1 would silently drop the final character.
        end_index = len(markdown_text)
    return markdown_text[body_start:end_index].strip()
|
| 67 |
+
|
| 68 |
+
# Centred HTML snippet with the overview section of the operator docs,
# truncated before the sentence that starts with "All the specific ".
op_desc = f"<div style='text-align: center;'>{extract_op_desc(op_text, '## Overview').split('All the specific ')[0].strip()}</div>"
# Per-operator-type description text taken from the matching doc sections.
op_list_desc = {
    'mapper': extract_op_desc(op_text, '## Mapper <a name="mapper"/>'),
    'filter': extract_op_desc(op_text, '## Filter <a name="filter"/>'),
    'deduplicator': extract_op_desc(op_text, '## Deduplicator <a name="deduplicator"/>'),
    'selector': extract_op_desc(op_text, '## Selector <a name="selector"/>'),
}

# Operator types that get a tab; 'deduplicator' and 'selector' are disabled.
op_types = ['mapper', 'filter']
# Operator names per type used when USE_LOCAL_OP is set (empty here).
local_ops_dict = {op_type: [] for op_type in op_types}
# Environment variables are strings, so "false"/"0" must be parsed explicitly
# instead of relying on truthiness. Unset means enabled.
multimodal = str(os.getenv('MULTI_MODAL', 'true')).strip().lower() not in (
    '', '0', 'false', 'no', 'off')
# Initial visibility of the image/video/audio components.
multimodal_visible = False
# Keys of the sample dict passed to the operators.
text_key = 'text'
image_key = 'images'
audio_key = 'audios'
video_key = 'videos'
|
| 84 |
+
|
| 85 |
+
def get_op_lists(op_type):
    """Return the operator names offered for ``op_type``.

    Args:
        op_type: Operator type suffix such as ``'mapper'`` or ``'filter'``.

    Returns:
        A new list of operator names. With the ``USE_LOCAL_OP`` environment
        variable enabled the names come from ``local_ops_dict``; otherwise
        every registered operator whose name ends with ``op_type`` is used.
        When ``multimodal`` is falsy, names containing ``image``, ``video``
        or ``audio`` are dropped.
    """
    # os.getenv returns a string, so "0"/"false" must be parsed explicitly;
    # plain truthiness would treat them as enabled.
    use_local_op = str(os.getenv('USE_LOCAL_OP', '')).strip().lower() not in (
        '', '0', 'false', 'no', 'off')
    if use_local_op:
        options = list(local_ops_dict.get(op_type, []))
    else:
        options = [
            name for name in OPERATORS.modules.keys() if name.endswith(op_type)
        ]

    if not multimodal:
        media_markers = ('image', 'video', 'audio')
        options = [
            name for name in options
            if not any(marker in name for marker in media_markers)
        ]
    return options
|
| 98 |
+
|
| 99 |
+
def show_code(op_name):
    """Return the source code and default parameters of an operator.

    Args:
        op_name: Key of the operator class in ``OPERATORS.modules``.

    Returns:
        A ``(source, params_yaml)`` tuple: the class source as one string and
        a YAML dump of every ``__init__`` parameter that has a default value
        (``self``, ``args`` and ``kwargs`` are skipped).
    """
    op_class = OPERATORS.modules[op_name]
    source_lines, _first_lineno = inspect.getsourcelines(op_class)

    skipped_names = ('self', 'args', 'kwargs')
    init_parameters = inspect.signature(op_class.__init__).parameters
    # Collect name -> default for every parameter that declares a default.
    defaults = {
        name: parameter.default
        for name, parameter in init_parameters.items()
        if name not in skipped_names
        and parameter.default is not inspect.Parameter.empty
    }

    return ''.join(source_lines), yaml.dump(defaults)
|
| 114 |
+
|
| 115 |
+
def change_visible(op_name):
    """Compute visibility updates for the text/image/video/audio components.

    Text is always visible. At most one media kind is made visible, chosen by
    the first of ``'video'``, ``'audio'``, ``'image'`` found in ``op_name``.

    Args:
        op_name: Name of the selected operator.

    Returns:
        A tuple of eight ``gr.update`` objects: (text, image, video, audio)
        visibility, repeated twice. Callers in this file bind them to
        ``outputs[:4] + inputs[:4]``.
    """
    shown = {'video': False, 'audio': False, 'image': False}
    # Same precedence as an if/elif chain: the first match wins.
    for modality in ('video', 'audio', 'image'):
        if modality in op_name:
            shown[modality] = True
            break

    one_group = [True, shown['image'], shown['video'], shown['audio']]
    return tuple(gr.update(visible=flag) for flag in one_group * 2)
|
| 127 |
+
|
| 128 |
+
def copy_func(file):
    """Copy ``file`` into the current working directory.

    Args:
        file: Path of the file to copy; ``None`` or ``''`` means no file.

    Returns:
        The base name of the copied file (a path relative to the working
        directory), or ``None`` when no file was given.
    """
    if not file:
        return None
    filename = os.path.basename(file)
    try:
        shutil.copyfile(file, filename)
    except shutil.SameFileError:
        # The file already lives in the working directory; nothing to copy.
        pass
    return filename
|
| 134 |
+
|
| 135 |
+
def encode_sample(input_text, input_image, input_video, input_audio):
    """Pack the UI inputs into a sample dict.

    Args:
        input_text: Text value.
        input_image: Image value, or a falsy value when absent.
        input_video: Video value, or a falsy value when absent.
        input_audio: Audio value, or a falsy value when absent.

    Returns:
        A dict keyed by ``text_key``, ``image_key``, ``video_key`` and
        ``audio_key``. Each media entry is a one-element list, or an empty
        list when the corresponding input is falsy.
    """
    def _as_list(value):
        return [value] if value else []

    return {
        text_key: input_text,
        image_key: _as_list(input_image),
        video_key: _as_list(input_video),
        audio_key: _as_list(input_audio),
    }
|
| 142 |
+
|
| 143 |
+
def decode_sample(output_sample):
    """Unpack a sample dict into values for the output components.

    The first image, video and audio entry (if any) is copied into the
    working directory via ``copy_func``.

    Args:
        output_sample: Dict with the ``text_key``, ``image_key``,
            ``video_key`` and ``audio_key`` entries.

    Returns:
        A ``(text, image_file, video_file, audio_file)`` tuple; each file
        entry is the value returned by ``copy_func`` (``None`` when the
        sample has no such media).
    """
    def _first_or_none(key):
        entries = output_sample[key]
        return entries[0] if entries else None

    output_text = output_sample[text_key]
    # Resolve all three media entries first, then copy them in the same order.
    media_paths = [_first_or_none(key) for key in (image_key, video_key, audio_key)]
    image_file, video_file, audio_file = [copy_func(path) for path in media_paths]
    return output_text, image_file, video_file, audio_file
|
| 152 |
+
|
| 153 |
+
def create_tab_layout(op_tab, op_type, run_op, has_stats=False):
    """Populate ``op_tab`` with the operator selector, inputs and outputs.

    Args:
        op_tab: The ``gr.Tab`` to fill.
        op_type: Operator type used to list the available operators.
        run_op: Callable invoked as ``run_op(text, image, video, audio,
            op_name, params)``; its return values are bound to the output
            components.
        has_stats: When true, a ``Stats`` JSON view and a ``Keep or not?``
            text box are added and appended to the outputs.
    """
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from a listing without indentation - confirm it matches
    # the intended page layout.
    with op_tab:
        options = get_op_lists(op_type)
        label = f'Select a {op_type} to show details'
        with gr.Row():
            # Guard against an empty operator list instead of raising IndexError.
            op_selector = gr.Dropdown(value=options[0] if options else None, label=label, choices=options, interactive=True)
            with gr.Column():
                gr.Markdown(" **Op Parameters**")
                op_params = gr.Code(label="Yaml",language='yaml', interactive=True)
                run_button = gr.Button(value="🚀Run")
                show_code_button = gr.Button(value="🔍Show Code")

            with gr.Column():
                with gr.Group('Inputs'):
                    gr.Markdown(" **Inputs**")
                    with gr.Row():
                        input_text = gr.TextArea(label="Text",interactive=True,)
                        input_image = gr.Image(label='Image', type='filepath', visible=multimodal_visible, elem_classes="show_image")
                        input_video = gr.Video(label='Video', visible=multimodal_visible)
                        input_audio = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)

                with gr.Group('Outputs'):
                    gr.Markdown(" **Outputs**")
                    with gr.Row():
                        output_text = gr.TextArea(label="Text",interactive=False,)
                        output_image = gr.Image(label='Image', type='filepath', visible=multimodal_visible, elem_classes="show_image")
                        output_video = gr.Video(label='Video', visible=multimodal_visible)
                        output_audio = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)

                with gr.Row():
                    if has_stats:
                        output_stats = gr.Json(label='Stats')
                        output_keep = gr.Text(label='Keep or not?', interactive=False)

        code = gr.Code(label='Source', language='python')
        inputs = [input_text, input_image, input_video, input_audio, op_selector, op_params]
        outputs = [output_text, output_image, output_video, output_audio]
        if has_stats:
            outputs.append(output_stats)
            outputs.append(output_keep)
        # Components whose visibility change_visible() toggles.
        visibility_targets = outputs[:4] + inputs[:4]

        def run_func(*args):
            """Parse the YAML parameters and run the operator."""
            args = list(args)
            raw_params = args.pop()  # last input is the YAML text
            try:
                params = yaml.safe_load(raw_params) if raw_params else None
            except yaml.YAMLError:
                # Unparseable YAML falls back to the operator defaults.
                params = {}
            if params is None:
                params = {}
            try:
                return run_op(*args, params)
            except Exception as e:
                print(e)
                # gr.Error only reaches the UI when it is raised.
                raise gr.Error(str(e)) from e

        show_code_button.click(show_code, inputs=[op_selector], outputs=[code, op_params])
        show_code_button.click(change_visible, inputs=[op_selector], outputs=visibility_targets)
        run_button.click(run_func, inputs=inputs, outputs=outputs)
        run_button.click(change_visible, inputs=[op_selector], outputs=visibility_targets)
        op_selector.select(show_code, inputs=[op_selector], outputs=[code, op_params])
        op_selector.select(change_visible, inputs=[op_selector], outputs=visibility_targets)
        op_tab.select(change_visible, inputs=[op_selector], outputs=visibility_targets)
|
| 217 |
+
|
| 218 |
+
def create_mapper_tab(op_type, op_tab):
    """Build the tab for mapper operators.

    Args:
        op_type: Operator type passed on to ``create_tab_layout``.
        op_tab: The ``gr.Tab`` that hosts the layout.
    """
    with op_tab:
        def run_op(input_text, input_image, input_video, input_audio, op_name, op_params):
            """Instantiate the mapper and run ``process`` on a copy of the sample."""
            mapper = OPERATORS.modules[op_name](**op_params)
            sample = encode_sample(input_text, input_image, input_video, input_audio)
            processed = mapper.process(copy.deepcopy(sample))
            return decode_sample(processed)

        create_tab_layout(op_tab, op_type, run_op)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def create_filter_tab(op_type, op_tab):
    """Build the tab for filter operators, including stats and keep decision.

    Args:
        op_type: Operator type passed on to ``create_tab_layout``.
        op_tab: The ``gr.Tab`` that hosts the layout.
    """
    def run_op(input_text, input_image, input_video, input_audio, op_name, op_params):
        """Compute stats for the sample and report whether it is kept."""
        filter_op = OPERATORS.modules[op_name](**op_params)
        sample = encode_sample(input_text, input_image, input_video, input_audio)
        sample[Fields.stats] = {}
        output_sample = filter_op.compute_stats(copy.deepcopy(sample))
        output_keep = 'Yes' if filter_op.process(output_sample) else 'No'
        output_stats = output_sample[Fields.stats]
        text, image_file, video_file, audio_file = decode_sample(output_sample)
        return text, image_file, video_file, audio_file, output_stats, output_keep

    create_tab_layout(op_tab, op_type, run_op, has_stats=True)
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def create_deduplicator_tab(op_type, op_tab):
    """Build the tab for deduplicator operators.

    Hash computation is currently disabled, so the sample is echoed back
    unchanged.

    Args:
        op_type: Operator type passed on to ``create_tab_layout``.
        op_tab: The ``gr.Tab`` that hosts the layout.
    """
    with op_tab:
        def run_op(input_text, input_image, input_video, input_audio, op_name, op_params):
            """Instantiate the operator and return the sample as-is."""
            op_class = OPERATORS.modules[op_name]
            # Still instantiate so invalid parameters surface as an error.
            op_class(**op_params)
            sample = encode_sample(input_text, input_image, input_video, input_audio)
            output_sample = sample  # op.compute_hash(copy.deepcopy(sample)) is disabled
            return decode_sample(output_sample)

        # run_op returns four values, so the layout must not register the two
        # extra stats/keep outputs (has_stats stays at its default, False).
        create_tab_layout(op_tab, op_type, run_op)
|
| 254 |
+
|
| 255 |
+
def create_tab_double_layout(op_tab, op_type, run_op):
    """Populate ``op_tab`` with a layout that takes two samples side by side.

    Args:
        op_tab: The ``gr.Tab`` to fill.
        op_type: Operator type used to list the available operators.
        run_op: Callable invoked with the submitted input values followed by
            the parsed parameter dict; its return values are bound to the
            eight output components.
            NOTE(review): the original handler passed component objects to
            ``run_op`` - confirm the intended call signature.
    """
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from a listing without indentation - confirm it matches
    # the intended page layout.
    with op_tab:
        options = get_op_lists(op_type)
        label = f'Select a {op_type} to show details'
        with gr.Row():
            # Guard against an empty operator list instead of raising IndexError.
            op_selector = gr.Dropdown(value=options[0] if options else None, label=label, choices=options, interactive=True)
            with gr.Column():
                gr.Markdown(" **Op Parameters**")
                op_params = gr.Code(label="Yaml",language='yaml', interactive=True)
                run_button = gr.Button(value="🚀Run")
                show_code_button = gr.Button(value="🔍Show Code")

            with gr.Column():
                with gr.Group('Inputs'):
                    gr.Markdown(" **Inputs**")
                    with gr.Row():
                        input_text = gr.TextArea(label="Text",interactive=True,)
                        input_text2 = gr.TextArea(label="Text",interactive=True,)
                        input_image = gr.Image(label='Image', type='filepath', visible=multimodal_visible, elem_classes="show_image")
                        input_image2 = gr.Image(label='Image', type='filepath', visible=multimodal_visible, elem_classes="show_image")
                        input_video = gr.Video(label='Video', visible=multimodal_visible)
                        input_video2 = gr.Video(label='Video', visible=multimodal_visible)
                        input_audio = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)
                        input_audio2 = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)

                with gr.Group('Outputs'):
                    gr.Markdown(" **Outputs**")
                    with gr.Row():
                        output_text = gr.TextArea(label="Text",interactive=False,)
                        output_text2 = gr.TextArea(label="Text",interactive=False,)
                        output_image = gr.Image(label='Image', type='filepath', visible=multimodal_visible, elem_classes="show_image")
                        output_image2 = gr.Image(label='Image', type='filepath', visible=multimodal_visible, elem_classes="show_image")
                        output_video = gr.Video(label='Video', visible=multimodal_visible)
                        output_video2 = gr.Video(label='Video', visible=multimodal_visible)
                        output_audio = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)
                        output_audio2 = gr.Audio(label='Audio', type='filepath', visible=multimodal_visible)

        code = gr.Code(label='Source', language='python')
        inputs = [input_text, input_image, input_video, input_audio, input_text2, input_image2, input_video2, input_audio2, op_selector, op_params]
        outputs = [output_text, output_image, output_video, output_audio, output_text2, output_image2, output_video2, output_audio2]
        # Only the first sample's components are toggled, as in the original.
        visibility_targets = outputs[:4] + inputs[:4]

        def run_func(*args):
            """Parse the YAML parameters and run the operator on the inputs."""
            args = list(args)
            raw_params = args.pop()  # last input is the YAML text
            try:
                params = yaml.safe_load(raw_params) if raw_params else None
            except yaml.YAMLError:
                # Unparseable YAML falls back to the operator defaults.
                params = {}
            if params is None:
                params = {}
            try:
                # Pass the submitted values, not the component objects.
                return run_op(*args, params)
            except Exception as e:
                print(e)
                # gr.Error only reaches the UI when it is raised.
                raise gr.Error(str(e)) from e

        show_code_button.click(change_visible, inputs=[op_selector], outputs=visibility_targets).then(show_code, inputs=[op_selector], outputs=[code, op_params])
        # The run step needs all inputs and writes to the output components.
        run_button.click(change_visible, inputs=[op_selector], outputs=visibility_targets).then(run_func, inputs=inputs, outputs=outputs)
        op_selector.select(change_visible, inputs=[op_selector], outputs=visibility_targets).then(show_code, inputs=[op_selector], outputs=[code, op_params])
        op_tab.select(change_visible, inputs=[op_selector], outputs=visibility_targets)
|
| 319 |
+
# Build the demo page: a cover banner followed by one tab per operator type.
# NOTE(review): block nesting was reconstructed from a listing without
# indentation - confirm it matches the intended layout.
with gr.Blocks(css="./app.css") as demo:

    dj_image = os.path.join(project_path, 'docs/imgs/data-juicer.jpg')
    gr.HTML(format_cover_html(dj_image))

    with gr.Accordion(label='Op Insight',open=True):
        tabs = gr.Tabs()
        with tabs:
            op_tabs = {op_type: gr.Tab(label=op_type.capitalize() + 's') for op_type in op_types}
            for op_type, op_tab in op_tabs.items():
                # Tab builders are looked up by name: create_<op_type>_tab.
                create_op_tab_func = globals().get(f'create_{op_type}_tab', None)
                if callable(create_op_tab_func):
                    create_op_tab_func(op_type, op_tab)
                else:
                    # An un-raised gr.Error has no effect; report on the
                    # console instead and leave the tab empty.
                    print(f'{op_type} not callable')

demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydantic>=2
|
| 2 |
+
git+https://gh-proxy.com/https://github.com/alibaba/data-juicer.git@demos/op_insight_slight#egg=py-data-juicer[all]
|