from copy import deepcopy import streamlit as st import pandas as pd from io import StringIO from transformers import AutoTokenizer, AutoModelForTableQuestionAnswering import numpy as np import weaviate from weaviate.embedded import EmbeddedOptions from weaviate import Client, ObjectsBatchRequest # Initialize TAPAS model and tokenizer tokenizer = AutoTokenizer.from_pretrained("google/tapas-large-finetuned-wtq") model = AutoModelForTableQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq") # Initialize Weaviate client for the embedded instance client = weaviate.Client( embedded_options=EmbeddedOptions() ) def ingest_data_to_weaviate(dataframe, class_name, class_description): properties = [] for column in dataframe.columns: data_type = "string" if dataframe[column].dtype == "float64": data_type = "float" elif dataframe[column].dtype == "int64": data_type = "int" properties.append({ "name": column, "description": column, "dataType": [data_type] }) schema = { "classes": [ { "class": class_name, "description": class_description, "properties": properties } ] } # Create Schema in Weaviate client.schema.create(schema) # Ingest Data batch_request = weaviate.ObjectsBatchRequest() for _, row in dataframe.iterrows(): obj = { "class": class_name, "properties": row.to_dict() } batch_request.add(obj) client.batch.create(batch_request) def query_weaviate(question): # This is a basic example; adapt the query based on the question results = client.query.get(class_name).with_near_text(question).do() return results def ask_llm_chunk(chunk, questions): chunk = chunk.astype(str) try: inputs = tokenizer(table=chunk, queries=questions, padding="max_length", truncation=True, return_tensors="pt") except Exception as e: st.write(f"An error occurred: {e}") return ["Error occurred while tokenizing"] * len(questions) if inputs["input_ids"].shape[1] > 512: st.warning("Token limit exceeded for chunk") return ["Token limit exceeded for chunk"] * len(questions) outputs = model(**inputs) predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( inputs, outputs.logits.detach(), outputs.logits_aggregation.detach() ) answers = [] for coordinates in predicted_answer_coordinates: if len(coordinates) == 1: row, col = coordinates[0] try: st.write(f"DataFrame shape: {chunk.shape}") # Debugging line st.write(f"DataFrame columns: {chunk.columns}") # Debugging line st.write(f"Trying to access row {row}, col {col}") # Debugging line value = chunk.iloc[row, col] st.write(f"Value accessed: {value}") # Debugging line answers.append(value) except Exception as e: st.write(f"An error occurred: {e}") else: cell_values = [] for coordinate in coordinates: row, col = coordinate try: value = chunk.iloc[row, col] cell_values.append(value) except Exception as e: st.write(f"An error occurred: {e}") answers.append(", ".join(map(str, cell_values))) return answers MAX_ROWS_PER_CHUNK = 200 def summarize_map_reduce(data, questions): dataframe = pd.read_csv(StringIO(data)) num_chunks = len(dataframe) // MAX_ROWS_PER_CHUNK + 1 dataframe_chunks = [deepcopy(chunk) for chunk in np.array_split(dataframe, num_chunks)] all_answers = [] for chunk in dataframe_chunks: chunk_answers = ask_llm_chunk(chunk, questions) all_answers.extend(chunk_answers) return all_answers st.title("TAPAS Table Question Answering with Weaviate Integration") # UI Input for Class and Description class_name = st.text_input("Enter the class name for your CSV data:") class_description = st.text_input("Enter a description for your class:") # Upload CSV data csv_file = st.file_uploader("Upload a CSV file", type=["csv"]) if csv_file is not None: data = csv_file.read().decode("utf-8") dataframe = pd.read_csv(StringIO(data)) st.write("CSV Data Preview:") st.write(dataframe.head()) # Ingest data to Weaviate if st.button("Ingest to Weaviate"): ingest_data_to_weaviate(dataframe, class_name, class_description) st.write("Data ingested successfully!") # Input for questions questions = st.text_area("Enter your questions (one per line)") questions = questions.split("\n") # split questions by line questions = [q for q in questions if q] # remove empty strings if st.button("Submit"): if data and questions: answers = summarize_map_reduce(data, questions) st.write("Answers:") for q, a in zip(questions, answers): st.write(f"Question: {q}") st.write(f"Answer: {a}") # Add Ctrl+Enter functionality for submitting the questions st.markdown(""" """, unsafe_allow_html=True)