Update app.py
app.py CHANGED
@@ -1,137 +1,137 @@
 import re
 import streamlit as st
 from langchain_groq import ChatGroq
 
 # Set up the title of the Streamlit app
-st.title("
+st.title("Code Analyzer & Migrator for SQL, PL-SQL, and BTEQ Files")
 
 # Sidebar for file upload, code language selection, and submit button
 st.sidebar.header("Upload your Document")
 uploaded_file = st.sidebar.file_uploader("Choose a file", type=["sql", "bteq"])
 language_choice = st.sidebar.selectbox("Convert code to:", ["Select a language", "pyspark"])
 submit_button = st.sidebar.button("Submit")
 
 # Initialize the LLM with Groq API
 groq_api_key = "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"
 llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")
 
 
 def llm_extract_sql_elements(content):
     # Request LLM for analysis of SQL elements
     prompt = (
         f"Analyze the following BTEQ code and generate a comprehensive Code Discovery Report for migration planning. "
         f"Report should include:\n\n"
         f"1. **Metric-Based Summary** - Use counts and examples:\n"
         f"   - Number of tables (e.g., `5 tables: Users, Orders, Products, Inventory, Sales`)\n"
         f"   - Number of views (e.g., `2 views: DailySales, MonthlyRevenue`)\n"
         f"   - CREATE statements count (e.g., `3 CREATE TABLE statements`)\n"
         f"   - SELECT statements count (e.g., `8 SELECT queries`)\n"
         f"   - JOIN operations count (e.g., `4 JOIN operations`)\n"
         f"   - Any syntax issues (e.g., `1 unmatched parenthesis`)\n\n"
         f"2. **Discrepancy Checks** - Identify potential issues:\n"
         f"   - Syntax issues, deprecated functions, or non-standard practices\n"
         f"   - Examples: Unclosed quotes, missing semicolons\n\n"
         f"3. **Key Observations & Learnings**:\n"
         f"   - Highlight repetitive operations or inefficiencies (e.g., `Repetitive SELECT * usage`)\n"
         f"   - Provide 2-3 short recommendations for the migration to PySpark (e.g., "
         f"`Consider using DataFrames for JOIN operations`)\n\n"
         f"Code:\n{content}"
     )
     response = llm.predict(prompt)
     return response
 
 
 
 
 def llm_convert_code(content, target_language):
     # Request LLM for conversion with validation instructions
     prompt = (
         f"Convert the following BTEQ code to fully functional {target_language} code, ensuring that:\n"
         f"1. All BTEQ-specific commands are accurately translated to equivalent {target_language} constructs.\n"
         f"2. SQL operations (e.g., SELECT, JOIN, WHERE) are migrated using appropriate {target_language} libraries (e.g., PySpark SQL DataFrame API for PySpark).\n"
         f"3. All syntax and logic are correct and executable in {target_language} without modifications.\n\n"
         f"After the conversion, validate the {target_language} code to confirm its correctness.\n\n"
         f"Examples of required conversions:\n"
         f"- BTEQ SELECT statements should map to PySpark DataFrame `select()` functions.\n"
         f"- Error handling in BTEQ (e.g., `.IF ERRORCODE`) should be replaced with equivalent exception handling.\n"
         f"- Explicit casting, joins, and aggregations should follow {target_language} best practices.\n\n"
         f"Please ensure that all transformed code is structured, efficient, and ready for production use in {target_language}.\n\n"
         f"Code to convert:\n{content}\n\n"
         "Example output code:\n"
         '''from pyspark.sql import SparkSession
 from pyspark.sql.functions import col, countDistinct, avg
 
 # Initialize SparkSession
 spark = SparkSession.builder.appName("SimplePySparkExample").getOrCreate()
 
 # Sample data for two DataFrames
 data1 = [(1, "Alice", "Sales", 5000), (2, "Bob", "HR", 4000), (3, "Charlie", "IT", 6000)]
 data2 = [(1, "Sales", "New York"), (2, "HR", "Los Angeles"), (3, "IT", "San Francisco")]
 
 # Create DataFrames
 df1 = spark.createDataFrame(data1, ["id", "name", "department", "salary"])
 df2 = spark.createDataFrame(data2, ["dept_id", "department", "location"])
 
 # Selecting specific columns and filtering rows
 selected_df = df1.select("name", "department", "salary").filter(col("salary") > 4500)
 
 # Joining DataFrames on the common "department" column
 joined_df = selected_df.join(df2, "department", "inner")
 
 # Aggregation: count distinct names and get the average salary by department
 agg_df = joined_df.groupBy("department").agg(countDistinct("name").alias("distinct_names"), avg("salary").alias("avg_salary"))
 
 # Show the final result
 agg_df.show()
 
 # Stop SparkSession
 spark.stop()
 '''
         f"Note: the output must contain only the converted PySpark (Python) code and no other pre or post text."
     )
     converted_code = llm.predict(prompt)
     return converted_code
 
 
 
 # Display content and analysis if a file is uploaded and the submit button is clicked
 if submit_button and uploaded_file is not None:
     # Read the uploaded file
     file_content = uploaded_file.read().decode("utf-8")
 
     # Display the uploaded document content in the main area
     st.subheader("Uploaded Document Content:")
     st.text_area("Document Content", file_content, height=300)
 
     # Basic content analysis
     st.subheader("Basic Analysis")
     line_count = len(file_content.splitlines())
     word_count = len(file_content.split())
     st.write(f"**Line Count**: {line_count}")
     # st.write(f"**Word Count**: {word_count}")
 
     # Extract SQL elements via LLM
     st.subheader("Code Discovery Report (LLM-Enhanced)")
     sql_analysis = llm_extract_sql_elements(file_content)
     st.write(sql_analysis)
 
     # Code conversion if a valid language is selected
     if language_choice != "Select a language":
         st.subheader(f"Code Conversion to {language_choice.capitalize()}")
         converted_code = llm_convert_code(file_content, language_choice)
 
         # Display the converted code
         st.text_area("Converted Code", converted_code, height=300)
 
         # Option to download the converted code as a text file
         st.download_button(
             label="Download Converted Code",
             data=converted_code,
             file_name=f"converted_code_{language_choice}.py",
             mime="text/plain"
         )
 
 elif submit_button and uploaded_file is None:
     st.warning("Please upload a file before submitting.")
 else:
     st.info("Upload a document in the sidebar, select the target language, and click Submit to analyze.")