Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Community

SantanuBanerjee committed on Aug 6, 2024

Commit

9f9f9bd

verified ·

1 Parent(s): a0f12e7

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -25

app.py CHANGED Viewed

@@ -199,8 +199,8 @@ from collections import Counter
 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
-                            cluster_range=(5, 25),
-                            top_words=10):
     console_messages.append("Extracting Problem Domains...")
     # Sentence Transformers approach
@@ -282,8 +282,8 @@ def text_processing_for_location(text):
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
-                              cluster_range=(5, 25),
-                              top_words=5):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
@@ -362,31 +362,43 @@ from transformers import GPTNeoForCausalLM, GPT2Tokenizer
 def generate_project_proposal(prompt):
     print("Trying to access gpt-neo-1.3B")
     print("prompt: \t", prompt)
-    try:
-        # Generate the proposal
-        model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
-        tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
-    except Exception as e:
-        print("Error loading models:", str(e))
-        console_messages.append("\n Error Loading Models")
-        return prompt
     try:
-        input_ids = tokenizer.encode(prompt, return_tensors="pt")
         print("Input IDs shape:", input_ids.shape)
         output = model.generate(
-            input_ids,
-            max_length=300,
-            num_return_sequences=1,
             no_repeat_ngram_size=2,
-            temperature=0.75)
         print("Output shape:", output.shape)
         proposal = tokenizer.decode(output[0], skip_special_tokens=True)
-        print("Successfully accessed gpt-neo-1.3B and returning")
         return proposal
     except Exception as e:
         print("Error generating proposal:", str(e))
-        return prompt
@@ -411,6 +423,7 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
             location = ", ".join([item.strip() for item in location_clusters[loc] if item])  # Clean and join
             problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item])  # Clean and join
             problem_descriptions = problem_cluster_df.loc[loc, prob]
             print("location: ", location)
             print("problem_domain: ", problem_domain)
@@ -418,16 +431,17 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
             # Check if problem_descriptions is valid (not NaN and not an empty list)
             if isinstance(problem_descriptions, list) and problem_descriptions:
-                print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
                 # Prepare the prompt
-                problems_summary = "; \n".join(problem_descriptions)  # Join all problem descriptions
-                prompt = f"Generate a solution oriented project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
-                proposal = generate_project_proposal(prompt)
-                proposals[(loc, prob)] = proposal
-                print("Generated Proposal: ", proposal)
             else:
                 print(f"Skipping empty problem descriptions for location: {location}, problem domain: {problem_domain}")

 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
+                            cluster_range=(5, 15),
+                            top_words=7):
     console_messages.append("Extracting Problem Domains...")
     # Sentence Transformers approach
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
+                              cluster_range=(5, 15),
+                              top_words=3):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
 def generate_project_proposal(prompt):
     print("Trying to access gpt-neo-1.3B")
     print("prompt: \t", prompt)
+    # Generate the proposal
+    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
+    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
     try:
+        # input_ids = tokenizer.encode(prompt, return_tensors="pt")
+        # Truncate the prompt to fit within the model's input limits
+        max_input_length = 2048  # Adjust as per your model's limit
+        input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
         print("Input IDs shape:", input_ids.shape)
         output = model.generate(
+            input_ids,
+            # max_length=300,
+            max_new_tokens=500,
+            num_return_sequences=1,
             no_repeat_ngram_size=2,
+            temperature=0.5,
+            pad_token_id=tokenizer.eos_token_id  # Ensure padding with EOS token
+            )
         print("Output shape:", output.shape)
         proposal = tokenizer.decode(output[0], skip_special_tokens=True)
+        if "Project Proposal:" in proposal:
+            proposal = proposal.split("Project Proposal:", 1)[1].strip()
+        else:
+            proposal = proposal.strip()
+        # print("Successfully accessed gpt-neo-1.3B and returning")
+        print("Generated Proposal: ", proposal)
         return proposal
     except Exception as e:
         print("Error generating proposal:", str(e))
+        return "Hyper-local Sustainability Projects would lead to Longevity of the self and Prosperity of the community. Therefore UNSDGs coupled with Longevity initiatives should be focused upon."
             location = ", ".join([item.strip() for item in location_clusters[loc] if item])  # Clean and join
             problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item])  # Clean and join
             problem_descriptions = problem_cluster_df.loc[loc, prob]
             print("location: ", location)
             print("problem_domain: ", problem_domain)
             # Check if problem_descriptions is valid (not NaN and not an empty list)
             if isinstance(problem_descriptions, list) and problem_descriptions:
+                # print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
+                print(f"Generating PP")
                 # Prepare the prompt
+                # problems_summary = "; \n".join(problem_descriptions)  # Join all problem descriptions
+                problems_summary = "; \n".join(problem_descriptions[:3])  # Limit to first 3 for brevity
+                # prompt = f"Generate a solution oriented project proposal for the following:\n\nLocation: {location}\nProblem Domain: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
+                prompt = f"Generate a solution-oriented project proposal for the following public problem (only output the proposal):\n\n Geographical/Digital Location: {location}\nProblem Category: {problem_domain}\nProblems: {problems_summary}\n\nProject Proposal:"
+                proposals[(loc, prob)] = generate_project_proposal(prompt)
             else:
                 print(f"Skipping empty problem descriptions for location: {location}, problem domain: {problem_domain}")