karthigrj commited on
Commit
30e077e
·
verified ·
1 Parent(s): 5d8f945

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -137
app.py CHANGED
@@ -1,137 +1,137 @@
1
- import re
2
- import streamlit as st
3
- from langchain_groq import ChatGroq
4
-
5
- # Set up the title of the Streamlit app
6
- st.title("Document Uploader and Analyzer for SQL, PL-SQL, and BTEQ Files")
7
-
8
- # Sidebar for file upload, code language selection, and submit button
9
- st.sidebar.header("Upload your Document")
10
- uploaded_file = st.sidebar.file_uploader("Choose a file", type=["sql", "bteq"])
11
- language_choice = st.sidebar.selectbox("Convert code to:", ["Select a language", "pyspark"])
12
- submit_button = st.sidebar.button("Submit")
13
-
14
- # Initialize the LLM with Groq API
15
- groq_api_key = "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"
16
- llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
17
-
18
-
19
- def llm_extract_sql_elements(content):
20
- # Request LLM for analysis of SQL elements
21
- prompt = (
22
- f"Analyze the following BTEQ code and generate a comprehensive Code Discovery Report for migration planning. "
23
- f"Report should include:\n\n"
24
- f"1. **Metric-Based Summary** - Use counts and examples:\n"
25
- f" - Number of tables (e.g., `5 tables: Users, Orders, Products, Inventory, Sales`)\n"
26
- f" - Number of views (e.g., `2 views: DailySales, MonthlyRevenue`)\n"
27
- f" - CREATE statements count (e.g., `3 CREATE TABLE statements`)\n"
28
- f" - SELECT statements count (e.g., `8 SELECT queries`)\n"
29
- f" - JOIN operations count (e.g., `4 JOIN operations`)\n"
30
- f" - Any syntax issues (e.g., `1 unmatched parenthesis`)\n\n"
31
- f"2. **Discrepancy Checks** - Identify potential issues:\n"
32
- f" - Syntax issues, deprecated functions, or non-standard practices\n"
33
- f" - Examples: Unclosed quotes, missing semicolons\n\n"
34
- f"3. **Key Observations & Learnings**:\n"
35
- f" - Highlight repetitive operations or inefficiencies (e.g., `Repetitive SELECT * usage`)\n"
36
- f" - Provide 2-3 short recommendations for the migration to PySpark (e.g., "
37
- f"`Consider using DataFrames for JOIN operations`)\n\n"
38
- f"Code:\n{content}"
39
- )
40
- response = llm.predict(prompt)
41
- return response
42
-
43
-
44
-
45
-
46
- def llm_convert_code(content, target_language):
47
- # Request LLM for conversion with validation instructions
48
- prompt = (
49
- f"Convert the following BTEQ code to fully functional {target_language} code, ensuring that:\n"
50
- f"1. All BTEQ-specific commands are accurately translated to equivalent {target_language} constructs.\n"
51
- f"2. SQL operations (e.g., SELECT, JOIN, WHERE) are migrated using appropriate {target_language} libraries (e.g., PySpark SQL DataFrame API for PySpark).\n"
52
- f"3. All syntax and logic are correct and executable in {target_language} without modifications.\n\n"
53
- f"After the conversion, validate the {target_language} code to confirm its correctness.\n\n"
54
- f"Examples of required conversions:\n"
55
- f"- BTEQ SELECT statements should map to PySpark DataFrame `select()` functions.\n"
56
- f"- Error handling in BTEQ (e.g., `.IF ERRORCODE`) should be replaced with equivalent exception handling.\n"
57
- f"- Explicit casting, joins, and aggregations should follow {target_language} best practices.\n\n"
58
- f"Please ensure that all transformed code is structured, efficient, and ready for production use in {target_language}.\n\n"
59
- f"Code to convert:\n{content}"
60
- "example output code:"
61
- '''from pyspark.sql import SparkSession
62
- from pyspark.sql.functions import col, countDistinct
63
-
64
- # Initialize SparkSession
65
- spark = SparkSession.builder.appName("SimplePySparkExample").getOrCreate()
66
-
67
- # Sample data for two DataFrames
68
- data1 = [(1, "Alice", "Sales", 5000), (2, "Bob", "HR", 4000), (3, "Charlie", "IT", 6000)]
69
- data2 = [(1, "Sales", "New York"), (2, "HR", "Los Angeles"), (3, "IT", "San Francisco")]
70
-
71
- # Create DataFrames
72
- df1 = spark.createDataFrame(data1, ["id", "name", "department", "salary"])
73
- df2 = spark.createDataFrame(data2, ["dept_id", "department", "location"])
74
-
75
- # Selecting specific columns and filtering rows
76
- selected_df = df1.select("name", "department", "salary").filter(col("salary") > 4500)
77
-
78
- # Joining DataFrames on a common column
79
- joined_df = selected_df.join(df2, df1.department == df2.department, "inner")
80
-
81
- # Aggregation: Count distinct departments and get the average salary by department
82
- agg_df = joined_df.groupBy("department").agg(countDistinct("name").alias("distinct_names"), avg("salary").alias("avg_salary"))
83
-
84
- # Show the final result
85
- agg_df.show()
86
-
87
- # Stop SparkSession
88
- spark.stop()
89
- '''
90
- f"Note: output must contain only the converted pyspark (python format)code and no other pre or post text"
91
- )
92
- converted_code = llm.predict(prompt)
93
- return converted_code
94
-
95
-
96
-
97
- # Display content and analysis if a file is uploaded and the submit button is clicked
98
- if submit_button and uploaded_file is not None:
99
- # Read the uploaded file
100
- file_content = uploaded_file.read().decode("utf-8")
101
-
102
- # Display the uploaded document content in the main area
103
- st.subheader("Uploaded Document Content:")
104
- st.text_area("Document Content", file_content, height=300)
105
-
106
- # Basic content analysis
107
- st.subheader("Basic Analysis")
108
- line_count = len(file_content.splitlines())
109
- word_count = len(file_content.split())
110
- st.write(f"**Line Count**: {line_count}")
111
- # st.write(f"**Word Count**: {word_count}")
112
-
113
- # Extract SQL elements via LLM
114
- st.subheader("Code Discovery Report (LLM-Enhanced)")
115
- sql_analysis = llm_extract_sql_elements(file_content)
116
- st.write(sql_analysis)
117
-
118
- # Code conversion if a valid language is selected
119
- if language_choice != "Select a language":
120
- st.subheader(f"Code Conversion to {language_choice.capitalize()}")
121
- converted_code = llm_convert_code(file_content, language_choice)
122
-
123
- # Display the converted code
124
- st.text_area("Converted Code", converted_code, height=300)
125
-
126
- # Option to download the converted code as a text file
127
- st.download_button(
128
- label="Download Converted Code",
129
- data=converted_code,
130
- file_name=f"converted_code_{language_choice}.py",
131
- mime="text/plain"
132
- )
133
-
134
- elif submit_button and uploaded_file is None:
135
- st.warning("Please upload a file before submitting.")
136
- else:
137
- st.info("Upload a document in the sidebar, select the target language, and click Submit to analyze.")
 
import os
import re  # NOTE(review): unused in this chunk; kept in case other code relies on it

import streamlit as st
from langchain_groq import ChatGroq

# --- UI setup -------------------------------------------------------------
# Set up the title of the Streamlit app
st.title("Code Analyzer & Migrator for SQL, PL-SQL, and BTEQ Files")

# Sidebar for file upload, target-language selection, and submit button
st.sidebar.header("Upload your Document")
uploaded_file = st.sidebar.file_uploader("Choose a file", type=["sql", "bteq"])
language_choice = st.sidebar.selectbox("Convert code to:", ["Select a language", "pyspark"])
submit_button = st.sidebar.button("Submit")

# --- LLM client -----------------------------------------------------------
# SECURITY FIX: never commit API keys to source control. The key is now read
# from the environment; the previously hard-coded key is exposed in the repo
# history and must be revoked/rotated on the Groq console.
groq_api_key = os.environ.get("GROQ_API_KEY", "")
if not groq_api_key:
    st.sidebar.warning("GROQ_API_KEY is not set; LLM calls will fail.")
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
def llm_extract_sql_elements(content):
    """Generate a migration-planning "Code Discovery Report" for *content*.

    The report (metric summary, discrepancy checks, observations) is produced
    entirely by the LLM from the prompt below — no local parsing is done.

    Parameters:
        content (str): Raw BTEQ/SQL script text to analyze.

    Returns:
        str: The LLM's free-form report text.

    NOTE(review): depends on the module-level ``llm`` client; ``predict`` is a
    legacy LangChain API — confirm it is still supported by the installed
    langchain version (modern code uses ``invoke``).
    """
    # Request LLM for analysis of SQL elements
    prompt = (
        f"Analyze the following BTEQ code and generate a comprehensive Code Discovery Report for migration planning. "
        f"Report should include:\n\n"
        f"1. **Metric-Based Summary** - Use counts and examples:\n"
        f"  - Number of tables (e.g., `5 tables: Users, Orders, Products, Inventory, Sales`)\n"
        f"  - Number of views (e.g., `2 views: DailySales, MonthlyRevenue`)\n"
        f"  - CREATE statements count (e.g., `3 CREATE TABLE statements`)\n"
        f"  - SELECT statements count (e.g., `8 SELECT queries`)\n"
        f"  - JOIN operations count (e.g., `4 JOIN operations`)\n"
        f"  - Any syntax issues (e.g., `1 unmatched parenthesis`)\n\n"
        f"2. **Discrepancy Checks** - Identify potential issues:\n"
        f"  - Syntax issues, deprecated functions, or non-standard practices\n"
        f"  - Examples: Unclosed quotes, missing semicolons\n\n"
        f"3. **Key Observations & Learnings**:\n"
        f"  - Highlight repetitive operations or inefficiencies (e.g., `Repetitive SELECT * usage`)\n"
        f"  - Provide 2-3 short recommendations for the migration to PySpark (e.g., "
        f"`Consider using DataFrames for JOIN operations`)\n\n"
        f"Code:\n{content}"
    )
    response = llm.predict(prompt)
    return response
def llm_convert_code(content, target_language):
    """Convert BTEQ source code to *target_language* via the LLM.

    Parameters:
        content (str): Raw BTEQ script text to convert.
        target_language (str): Name of the target language (the UI currently
            offers only "pyspark"); interpolated into the prompt.

    Returns:
        str: The LLM's response — the prompt instructs the model to emit only
        the converted code with no surrounding prose, but this is not
        enforced locally.

    NOTE(review): depends on the module-level ``llm`` client; ``predict`` is a
    legacy LangChain API — confirm against the installed langchain version.
    """
    # Request LLM for conversion with validation instructions
    prompt = (
        f"Convert the following BTEQ code to fully functional {target_language} code, ensuring that:\n"
        f"1. All BTEQ-specific commands are accurately translated to equivalent {target_language} constructs.\n"
        f"2. SQL operations (e.g., SELECT, JOIN, WHERE) are migrated using appropriate {target_language} libraries (e.g., PySpark SQL DataFrame API for PySpark).\n"
        f"3. All syntax and logic are correct and executable in {target_language} without modifications.\n\n"
        f"After the conversion, validate the {target_language} code to confirm its correctness.\n\n"
        f"Examples of required conversions:\n"
        f"- BTEQ SELECT statements should map to PySpark DataFrame `select()` functions.\n"
        f"- Error handling in BTEQ (e.g., `.IF ERRORCODE`) should be replaced with equivalent exception handling.\n"
        f"- Explicit casting, joins, and aggregations should follow {target_language} best practices.\n\n"
        f"Please ensure that all transformed code is structured, efficient, and ready for production use in {target_language}.\n\n"
        f"Code to convert:\n{content}"
        "example output code:"
        # Few-shot example shown to the model.
        # BUGFIX: the example now imports `avg`, which it uses in the
        # aggregation step; previously only `col` and `countDistinct` were
        # imported, so the sample code taught the model a broken pattern.
        '''from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, avg

# Initialize SparkSession
spark = SparkSession.builder.appName("SimplePySparkExample").getOrCreate()

# Sample data for two DataFrames
data1 = [(1, "Alice", "Sales", 5000), (2, "Bob", "HR", 4000), (3, "Charlie", "IT", 6000)]
data2 = [(1, "Sales", "New York"), (2, "HR", "Los Angeles"), (3, "IT", "San Francisco")]

# Create DataFrames
df1 = spark.createDataFrame(data1, ["id", "name", "department", "salary"])
df2 = spark.createDataFrame(data2, ["dept_id", "department", "location"])

# Selecting specific columns and filtering rows
selected_df = df1.select("name", "department", "salary").filter(col("salary") > 4500)

# Joining DataFrames on a common column
joined_df = selected_df.join(df2, df1.department == df2.department, "inner")

# Aggregation: Count distinct departments and get the average salary by department
agg_df = joined_df.groupBy("department").agg(countDistinct("name").alias("distinct_names"), avg("salary").alias("avg_salary"))

# Show the final result
agg_df.show()

# Stop SparkSession
spark.stop()
'''
        f"Note: output must contain only the converted pyspark (python format)code and no other pre or post text"
    )
    converted_code = llm.predict(prompt)
    return converted_code
# Main flow: display content and analysis once a file is uploaded and Submit
# is clicked; otherwise show guidance or a missing-file warning.
if submit_button and uploaded_file is not None:
    # Read the uploaded file; assumes UTF-8 text (typical for SQL/BTEQ
    # scripts) — TODO confirm whether other encodings must be supported.
    file_content = uploaded_file.read().decode("utf-8")

    # Echo the uploaded document so the user can confirm the right file.
    st.subheader("Uploaded Document Content:")
    st.text_area("Document Content", file_content, height=300)

    # Basic content analysis.
    # CLEANUP: removed the dead `word_count` computation — its only consumer
    # was a commented-out st.write, so it was wasted work on every submit.
    st.subheader("Basic Analysis")
    line_count = len(file_content.splitlines())
    st.write(f"**Line Count**: {line_count}")

    # LLM-generated discovery report for migration planning.
    st.subheader("Code Discovery Report (LLM-Enhanced)")
    sql_analysis = llm_extract_sql_elements(file_content)
    st.write(sql_analysis)

    # Convert only when the user picked a real target (not the placeholder).
    if language_choice != "Select a language":
        st.subheader(f"Code Conversion to {language_choice.capitalize()}")
        converted_code = llm_convert_code(file_content, language_choice)

        # Display the converted code
        st.text_area("Converted Code", converted_code, height=300)

        # Option to download the converted code as a text file
        st.download_button(
            label="Download Converted Code",
            data=converted_code,
            file_name=f"converted_code_{language_choice}.py",
            mime="text/plain",
        )

elif submit_button and uploaded_file is None:
    st.warning("Please upload a file before submitting.")
else:
    st.info("Upload a document in the sidebar, select the target language, and click Submit to analyze.")