File size: 6,696 Bytes
30e077e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import re

import streamlit as st
from langchain_groq import ChatGroq

# Set up the title of the Streamlit app
st.title("Code Analyzer & Migrator for SQL, PL-SQL, and BTEQ Files")

# Sidebar: file upload, target-language selection, and submit button
st.sidebar.header("Upload your Document")
uploaded_file = st.sidebar.file_uploader("Choose a file", type=["sql", "bteq"])
language_choice = st.sidebar.selectbox("Convert code to:", ["Select a language", "pyspark"])
submit_button = st.sidebar.button("Submit")

# Initialize the LLM with the Groq API.
# SECURITY: the API key must never be hard-coded in source control (the
# previous revision leaked a live key). Read it from the environment instead;
# set GROQ_API_KEY before launching the app.
groq_api_key = os.environ.get("GROQ_API_KEY", "")
if not groq_api_key:
    st.error("GROQ_API_KEY environment variable is not set.")
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")


def llm_extract_sql_elements(content):
    """Ask the LLM for a migration-planning Code Discovery Report.

    Parameters
    ----------
    content : str
        Raw BTEQ/SQL source text read from the uploaded file.

    Returns
    -------
    str
        The model's markdown report (metric summary, discrepancy checks,
        observations and PySpark-migration recommendations).
    """
    # Request LLM for analysis of SQL elements
    prompt = (
        f"Analyze the following BTEQ code and generate a comprehensive Code Discovery Report for migration planning. "
        f"Report should include:\n\n"
        f"1. **Metric-Based Summary** - Use counts and examples:\n"
        f"   - Number of tables (e.g., `5 tables: Users, Orders, Products, Inventory, Sales`)\n"
        f"   - Number of views (e.g., `2 views: DailySales, MonthlyRevenue`)\n"
        f"   - CREATE statements count (e.g., `3 CREATE TABLE statements`)\n"
        f"   - SELECT statements count (e.g., `8 SELECT queries`)\n"
        f"   - JOIN operations count (e.g., `4 JOIN operations`)\n"
        f"   - Any syntax issues (e.g., `1 unmatched parenthesis`)\n\n"
        f"2. **Discrepancy Checks** - Identify potential issues:\n"
        f"   - Syntax issues, deprecated functions, or non-standard practices\n"
        f"   - Examples: Unclosed quotes, missing semicolons\n\n"
        f"3. **Key Observations & Learnings**:\n"
        f"   - Highlight repetitive operations or inefficiencies (e.g., `Repetitive SELECT * usage`)\n"
        f"   - Provide 2-3 short recommendations for the migration to PySpark (e.g., "
        f"`Consider using DataFrames for JOIN operations`)\n\n"
        f"Code:\n{content}"
    )
    # `predict()` is deprecated in modern LangChain; `invoke()` returns a
    # message object whose `.content` attribute holds the text reply, so the
    # function still returns a plain str as before.
    response = llm.invoke(prompt)
    return response.content




def llm_convert_code(content, target_language):
    """Ask the LLM to convert BTEQ code into ``target_language``.

    Parameters
    ----------
    content : str
        Raw BTEQ source text to convert.
    target_language : str
        Name of the target language (currently only "pyspark" is offered
        by the UI).

    Returns
    -------
    str
        The converted code only (the prompt instructs the model to omit
        any surrounding prose).
    """
    # Request LLM for conversion with validation instructions
    prompt = (
        f"Convert the following BTEQ code to fully functional {target_language} code, ensuring that:\n"
        f"1. All BTEQ-specific commands are accurately translated to equivalent {target_language} constructs.\n"
        f"2. SQL operations (e.g., SELECT, JOIN, WHERE) are migrated using appropriate {target_language} libraries (e.g., PySpark SQL DataFrame API for PySpark).\n"
        f"3. All syntax and logic are correct and executable in {target_language} without modifications.\n\n"
        f"After the conversion, validate the {target_language} code to confirm its correctness.\n\n"
        f"Examples of required conversions:\n"
        f"- BTEQ SELECT statements should map to PySpark DataFrame `select()` functions.\n"
        f"- Error handling in BTEQ (e.g., `.IF ERRORCODE`) should be replaced with equivalent exception handling.\n"
        f"- Explicit casting, joins, and aggregations should follow {target_language} best practices.\n\n"
        f"Please ensure that all transformed code is structured, efficient, and ready for production use in {target_language}.\n\n"
        f"Code to convert:\n{content}"
        "example output code:"
        # NOTE: the example below uses `avg`, so it must be imported — the
        # previous prompt omitted it, showing the model a broken example.
        '''from pyspark.sql import SparkSession
            from pyspark.sql.functions import col, countDistinct, avg
            
            # Initialize SparkSession
            spark = SparkSession.builder.appName("SimplePySparkExample").getOrCreate()
            
            # Sample data for two DataFrames
            data1 = [(1, "Alice", "Sales", 5000), (2, "Bob", "HR", 4000), (3, "Charlie", "IT", 6000)]
            data2 = [(1, "Sales", "New York"), (2, "HR", "Los Angeles"), (3, "IT", "San Francisco")]
            
            # Create DataFrames
            df1 = spark.createDataFrame(data1, ["id", "name", "department", "salary"])
            df2 = spark.createDataFrame(data2, ["dept_id", "department", "location"])
            
            # Selecting specific columns and filtering rows
            selected_df = df1.select("name", "department", "salary").filter(col("salary") > 4500)
            
            # Joining DataFrames on a common column
            joined_df = selected_df.join(df2, df1.department == df2.department, "inner")
            
            # Aggregation: Count distinct departments and get the average salary by department
            agg_df = joined_df.groupBy("department").agg(countDistinct("name").alias("distinct_names"), avg("salary").alias("avg_salary"))
            
            # Show the final result
            agg_df.show()
            
            # Stop SparkSession
            spark.stop()
            '''
        f"Note: output must contain only the converted pyspark (python format)code and no other pre or post text"
    )
    # `predict()` is deprecated in modern LangChain; `invoke()` returns a
    # message object — return its `.content` so callers still get a str.
    converted_code = llm.invoke(prompt)
    return converted_code.content



# Main flow: analyze (and optionally convert) once a file is submitted.
if submit_button and uploaded_file is not None:
    # Decode the upload; replace undecodable bytes rather than letting a
    # non-UTF-8 file crash the app with UnicodeDecodeError.
    file_content = uploaded_file.read().decode("utf-8", errors="replace")

    # Display the uploaded document content in the main area
    st.subheader("Uploaded Document Content:")
    st.text_area("Document Content", file_content, height=300)

    # Basic content analysis
    st.subheader("Basic Analysis")
    line_count = len(file_content.splitlines())
    st.write(f"**Line Count**: {line_count}")

    # LLM-generated discovery report for migration planning
    st.subheader("Code Discovery Report (LLM-Enhanced)")
    sql_analysis = llm_extract_sql_elements(file_content)
    st.write(sql_analysis)

    # Code conversion only runs when a concrete target language is selected
    if language_choice != "Select a language":
        st.subheader(f"Code Conversion to {language_choice.capitalize()}")
        converted_code = llm_convert_code(file_content, language_choice)

        # Display the converted code
        st.text_area("Converted Code", converted_code, height=300)

        # Option to download the converted code as a text file
        st.download_button(
            label="Download Converted Code",
            data=converted_code,
            file_name=f"converted_code_{language_choice}.py",
            mime="text/plain",
        )

elif submit_button and uploaded_file is None:
    st.warning("Please upload a file before submitting.")
else:
    st.info("Upload a document in the sidebar, select the target language, and click Submit to analyze.")