|
|
import os
import re

import streamlit as st

from langchain_groq import ChatGroq
|
|
|
|
|
|
|
|
# --- Page setup and sidebar inputs -------------------------------------------
st.title("Code Analyzer & Migrator for SQL, PL-SQL, and BTEQ Files")

st.sidebar.header("Upload your Document")

# Only SQL-family sources are accepted; ".bteq" covers Teradata BTEQ scripts.
uploaded_file = st.sidebar.file_uploader("Choose a file", type=["sql", "bteq"])

language_choice = st.sidebar.selectbox("Convert code to:", ["Select a language", "pyspark"])

submit_button = st.sidebar.button("Submit")

# SECURITY FIX: the Groq API key was previously hard-coded in this file,
# leaking the credential to anyone with read access to the repository.
# Read it from the environment instead (export GROQ_API_KEY before launch).
groq_api_key = os.environ.get("GROQ_API_KEY", "")
if not groq_api_key:
    st.sidebar.error("GROQ_API_KEY environment variable is not set.")

# Shared chat model used by both the discovery and conversion helpers below.
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
|
|
|
|
|
|
|
|
def llm_extract_sql_elements(content):
    """Ask the LLM for a migration-planning Code Discovery Report.

    Parameters
    ----------
    content : str
        Raw text of the uploaded BTEQ/SQL file.

    Returns
    -------
    str
        Free-form markdown report produced by the model.
    """
    # Fixed instruction fragments plus the user's code; joined with no
    # separator so the final prompt is identical to plain concatenation.
    fragments = (
        "Analyze the following BTEQ code and generate a comprehensive Code Discovery Report for migration planning. ",
        "Report should include:\n\n",
        "1. **Metric-Based Summary** - Use counts and examples:\n",
        "   - Number of tables (e.g., `5 tables: Users, Orders, Products, Inventory, Sales`)\n",
        "   - Number of views (e.g., `2 views: DailySales, MonthlyRevenue`)\n",
        "   - CREATE statements count (e.g., `3 CREATE TABLE statements`)\n",
        "   - SELECT statements count (e.g., `8 SELECT queries`)\n",
        "   - JOIN operations count (e.g., `4 JOIN operations`)\n",
        "   - Any syntax issues (e.g., `1 unmatched parenthesis`)\n\n",
        "2. **Discrepancy Checks** - Identify potential issues:\n",
        "   - Syntax issues, deprecated functions, or non-standard practices\n",
        "   - Examples: Unclosed quotes, missing semicolons\n\n",
        "3. **Key Observations & Learnings**:\n",
        "   - Highlight repetitive operations or inefficiencies (e.g., `Repetitive SELECT * usage`)\n",
        "   - Provide 2-3 short recommendations for the migration to PySpark (e.g., ",
        "`Consider using DataFrames for JOIN operations`)\n\n",
        f"Code:\n{content}",
    )
    discovery_prompt = "".join(fragments)
    return llm.predict(discovery_prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def llm_convert_code(content, target_language):
    """Convert BTEQ source code to ``target_language`` using the LLM.

    Parameters
    ----------
    content : str
        Raw BTEQ/SQL text to translate.
    target_language : str
        Destination language name (the UI currently offers only "pyspark").

    Returns
    -------
    str
        The model's converted code; the prompt instructs the model to emit
        code only, with no surrounding prose.
    """
    prompt = (
        f"Convert the following BTEQ code to fully functional {target_language} code, ensuring that:\n"
        f"1. All BTEQ-specific commands are accurately translated to equivalent {target_language} constructs.\n"
        f"2. SQL operations (e.g., SELECT, JOIN, WHERE) are migrated using appropriate {target_language} libraries (e.g., PySpark SQL DataFrame API for PySpark).\n"
        f"3. All syntax and logic are correct and executable in {target_language} without modifications.\n\n"
        f"After the conversion, validate the {target_language} code to confirm its correctness.\n\n"
        f"Examples of required conversions:\n"
        f"- BTEQ SELECT statements should map to PySpark DataFrame `select()` functions.\n"
        f"- Error handling in BTEQ (e.g., `.IF ERRORCODE`) should be replaced with equivalent exception handling.\n"
        f"- Explicit casting, joins, and aggregations should follow {target_language} best practices.\n\n"
        f"Please ensure that all transformed code is structured, efficient, and ready for production use in {target_language}.\n\n"
        f"Code to convert:\n{content}"
        "example output code:"
        # BUG FIX: the exemplar below previously imported only `col` and
        # `countDistinct` while its aggregation used `avg`, so the model was
        # shown code that raises NameError. `avg` is now imported.
        '''from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, avg

# Initialize SparkSession
spark = SparkSession.builder.appName("SimplePySparkExample").getOrCreate()

# Sample data for two DataFrames
data1 = [(1, "Alice", "Sales", 5000), (2, "Bob", "HR", 4000), (3, "Charlie", "IT", 6000)]
data2 = [(1, "Sales", "New York"), (2, "HR", "Los Angeles"), (3, "IT", "San Francisco")]

# Create DataFrames
df1 = spark.createDataFrame(data1, ["id", "name", "department", "salary"])
df2 = spark.createDataFrame(data2, ["dept_id", "department", "location"])

# Selecting specific columns and filtering rows
selected_df = df1.select("name", "department", "salary").filter(col("salary") > 4500)

# Joining DataFrames on a common column
joined_df = selected_df.join(df2, df1.department == df2.department, "inner")

# Aggregation: Count distinct departments and get the average salary by department
agg_df = joined_df.groupBy("department").agg(countDistinct("name").alias("distinct_names"), avg("salary").alias("avg_salary"))

# Show the final result
agg_df.show()

# Stop SparkSession
spark.stop()
'''
        f"Note: output must contain only the converted pyspark (python format)code and no other pre or post text"
    )
    converted_code = llm.predict(prompt)
    return converted_code
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Main flow: analyze the upload and optionally convert it -----------------
if submit_button and uploaded_file is not None:
    # ROBUSTNESS FIX: a non-UTF-8 upload previously crashed with a raw
    # traceback in the UI; report it as a readable error instead.
    try:
        file_content = uploaded_file.read().decode("utf-8")
    except UnicodeDecodeError:
        st.error("The uploaded file is not valid UTF-8 text; please check its encoding.")
        st.stop()

    st.subheader("Uploaded Document Content:")
    st.text_area("Document Content", file_content, height=300)

    # Cheap local metrics before involving the LLM.
    st.subheader("Basic Analysis")
    line_count = len(file_content.splitlines())
    word_count = len(file_content.split())
    st.write(f"**Line Count**: {line_count}")
    # BUG FIX: word_count was computed but never displayed.
    st.write(f"**Word Count**: {word_count}")

    st.subheader("Code Discovery Report (LLM-Enhanced)")
    sql_analysis = llm_extract_sql_elements(file_content)
    st.write(sql_analysis)

    # Conversion only runs when the user picked a concrete target language.
    if language_choice != "Select a language":
        st.subheader(f"Code Conversion to {language_choice.capitalize()}")
        converted_code = llm_convert_code(file_content, language_choice)

        st.text_area("Converted Code", converted_code, height=300)

        # .py extension is correct for the single "pyspark" target offered.
        st.download_button(
            label="Download Converted Code",
            data=converted_code,
            file_name=f"converted_code_{language_choice}.py",
            mime="text/plain",
        )

elif submit_button and uploaded_file is None:
    st.warning("Please upload a file before submitting.")

else:
    st.info("Upload a document in the sidebar, select the target language, and click Submit to analyze.")
|
|
|