Spaces:
Build error
Weaviate schema based on user input and csv upload
Browse files
UI Input for Class and Description:
We'll add input fields in the Streamlit app where users can define the class name and description for the CSV they're uploading.
Auto-Populate Schema from CSV:
Once the CSV is uploaded, we'll read its headers to determine the column names. We can then use simple heuristics to determine the data type of each column (e.g., if a column contains only numbers, it's likely a float or int; if it matches date patterns, it's a date; otherwise, it's a string). We'll also use the column names as descriptions for simplicity, but this can be enhanced further if needed.
Create Schema in Weaviate:
Using the class name, description, and the auto-populated properties, we'll define the schema and create it in Weaviate.
Ingest Data:
Once the schema is created, we can then ingest the data from the CSV into Weaviate.
|
@@ -16,20 +16,46 @@ client = weaviate.Client(
|
|
| 16 |
embedded_options=EmbeddedOptions()
|
| 17 |
)
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
obj = {
|
| 23 |
-
"class":
|
| 24 |
-
"id": str(index),
|
| 25 |
"properties": row.to_dict()
|
| 26 |
}
|
| 27 |
-
|
|
|
|
| 28 |
|
| 29 |
-
# Function to query data from Weaviate
|
| 30 |
def query_weaviate(question):
|
| 31 |
# This is a basic example; adapt the query based on the question
|
| 32 |
-
results = client.query.get(
|
| 33 |
return results
|
| 34 |
|
| 35 |
def ask_llm_chunk(chunk, questions):
|
|
@@ -89,20 +115,25 @@ def summarize_map_reduce(data, questions):
|
|
| 89 |
all_answers.extend(chunk_answers)
|
| 90 |
return all_answers
|
| 91 |
|
| 92 |
-
st.title("TAPAS Table Question Answering with Weaviate")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# Upload CSV data
|
| 95 |
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
|
| 96 |
if csv_file is not None:
|
| 97 |
data = csv_file.read().decode("utf-8")
|
| 98 |
dataframe = pd.read_csv(StringIO(data))
|
| 99 |
-
|
| 100 |
-
# Ingest data into Weaviate
|
| 101 |
-
ingest_data_to_weaviate(dataframe)
|
| 102 |
-
|
| 103 |
st.write("CSV Data Preview:")
|
| 104 |
st.write(dataframe.head())
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# Input for questions
|
| 107 |
questions = st.text_area("Enter your questions (one per line)")
|
| 108 |
questions = questions.split("\n") # split questions by line
|
|
@@ -110,14 +141,7 @@ if csv_file is not None:
|
|
| 110 |
|
| 111 |
if st.button("Submit"):
|
| 112 |
if data and questions:
|
| 113 |
-
|
| 114 |
-
relevant_data = query_weaviate(questions[0]) # Example: using the first question
|
| 115 |
-
# Convert the relevant data to a DataFrame (you might need to adjust this based on the Weaviate response format)
|
| 116 |
-
relevant_df = pd.DataFrame(relevant_data)
|
| 117 |
-
|
| 118 |
-
# Pass the relevant data to TAPAS
|
| 119 |
-
answers = summarize_map_reduce(relevant_df, questions)
|
| 120 |
-
|
| 121 |
st.write("Answers:")
|
| 122 |
for q, a in zip(questions, answers):
|
| 123 |
st.write(f"Question: {q}")
|
|
|
|
| 16 |
embedded_options=EmbeddedOptions()
|
| 17 |
)
|
| 18 |
|
| 19 |
+
def ingest_data_to_weaviate(dataframe, class_name, class_description):
|
| 20 |
+
properties = []
|
| 21 |
+
for column in dataframe.columns:
|
| 22 |
+
data_type = "string"
|
| 23 |
+
if dataframe[column].dtype == "float64":
|
| 24 |
+
data_type = "float"
|
| 25 |
+
elif dataframe[column].dtype == "int64":
|
| 26 |
+
data_type = "int"
|
| 27 |
+
properties.append({
|
| 28 |
+
"name": column,
|
| 29 |
+
"description": column,
|
| 30 |
+
"dataType": [data_type]
|
| 31 |
+
})
|
| 32 |
+
|
| 33 |
+
schema = {
|
| 34 |
+
"classes": [
|
| 35 |
+
{
|
| 36 |
+
"class": class_name,
|
| 37 |
+
"description": class_description,
|
| 38 |
+
"properties": properties
|
| 39 |
+
}
|
| 40 |
+
]
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
# Create Schema in Weaviate
|
| 44 |
+
client.schema.create(schema)
|
| 45 |
+
|
| 46 |
+
# Ingest Data
|
| 47 |
+
batch_request = weaviate.ObjectsBatchRequest()
|
| 48 |
+
for _, row in dataframe.iterrows():
|
| 49 |
obj = {
|
| 50 |
+
"class": class_name,
|
|
|
|
| 51 |
"properties": row.to_dict()
|
| 52 |
}
|
| 53 |
+
batch_request.add(obj)
|
| 54 |
+
client.batch.create(batch_request)
|
| 55 |
|
|
|
|
| 56 |
def query_weaviate(question):
|
| 57 |
# This is a basic example; adapt the query based on the question
|
| 58 |
+
results = client.query.get(class_name).with_near_text(question).do()
|
| 59 |
return results
|
| 60 |
|
| 61 |
def ask_llm_chunk(chunk, questions):
|
|
|
|
| 115 |
all_answers.extend(chunk_answers)
|
| 116 |
return all_answers
|
| 117 |
|
| 118 |
+
st.title("TAPAS Table Question Answering with Weaviate Integration")
|
| 119 |
+
|
| 120 |
+
# UI Input for Class and Description
|
| 121 |
+
class_name = st.text_input("Enter the class name for your CSV data:")
|
| 122 |
+
class_description = st.text_input("Enter a description for your class:")
|
| 123 |
|
| 124 |
# Upload CSV data
|
| 125 |
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
|
| 126 |
if csv_file is not None:
|
| 127 |
data = csv_file.read().decode("utf-8")
|
| 128 |
dataframe = pd.read_csv(StringIO(data))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
st.write("CSV Data Preview:")
|
| 130 |
st.write(dataframe.head())
|
| 131 |
|
| 132 |
+
# Ingest data to Weaviate
|
| 133 |
+
if st.button("Ingest to Weaviate"):
|
| 134 |
+
ingest_data_to_weaviate(dataframe, class_name, class_description)
|
| 135 |
+
st.write("Data ingested successfully!")
|
| 136 |
+
|
| 137 |
# Input for questions
|
| 138 |
questions = st.text_area("Enter your questions (one per line)")
|
| 139 |
questions = questions.split("\n") # split questions by line
|
|
|
|
| 141 |
|
| 142 |
if st.button("Submit"):
|
| 143 |
if data and questions:
|
| 144 |
+
answers = summarize_map_reduce(data, questions)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
st.write("Answers:")
|
| 146 |
for q, a in zip(questions, answers):
|
| 147 |
st.write(f"Question: {q}")
|