Spaces:

Manasa1
/

tweets_clone

Build error

App Files Files Community

Update app.py

by Manasa1 - opened Dec 14, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+316

-8

Files changed (1) hide show

app.py +316 -8

app.py CHANGED Viewed

@@ -9,10 +9,9 @@ from dotenv import load_dotenv
 load_dotenv()
 class TweetDatasetProcessor:
-    def __init__(self, fine_tuned_model_name, pdf_path):
         self.tweets = []
         self.personality_profile = {}
-        self.vectorizer = None  # No need for vectorizer here since we're not clustering
         self.used_tweets = set()  # Track used tweets to avoid repetition
         self.pdf_path = pdf_path
@@ -35,6 +34,8 @@ class TweetDatasetProcessor:
     def extract_text_from_pdf(self):
         """Extract text content from PDF file."""
         reader = PdfReader(self.pdf_path)
         text = ""
         for page in reader.pages:
@@ -111,24 +112,331 @@ class TweetDatasetProcessor:
         return generated_tweet
 # Gradio Interface Function
-def gradio_interface():
-    # Path to the PDF with tweets
-    pdf_path = 'Dataset (4).pdf'  # Replace with your PDF file path
     fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
-    processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path)
     text = processor.extract_text_from_pdf()
     tweets = processor.process_pdf_content(text)
     personality_analysis = processor.analyze_personality(max_tweets=50)
-    generated_tweet = processor.generate_tweet(context="AI-powered tweet generation", sample_size=3)
     return personality_analysis, generated_tweet
 # Gradio app setup
 iface = gr.Interface(
     fn=gradio_interface,
-    inputs=[],
     outputs=[
         gr.Textbox(label="Personality Analysis"),
         gr.Textbox(label="Generated Tweet")

 load_dotenv()
 class TweetDatasetProcessor:
+    def __init__(self, fine_tuned_model_name, pdf_path=None):
         self.tweets = []
         self.personality_profile = {}
         self.used_tweets = set()  # Track used tweets to avoid repetition
         self.pdf_path = pdf_path
     def extract_text_from_pdf(self):
         """Extract text content from PDF file."""
+        if not self.pdf_path:
+            return ""
         reader = PdfReader(self.pdf_path)
         text = ""
         for page in reader.pages:
         return generated_tweet
 # Gradio Interface Function
+def gradio_interface(pdf_file, context="AI-powered tweet generation"):
+    """Analyze the uploaded PDF of tweets and generate one new tweet.
+
+    Args:
+        pdf_file: Value of the file-upload input: an object exposing the
+            path as ``.name``, a plain path string, or None when nothing
+            has been uploaded yet.
+        context: Topic for the generated tweet. A blank value falls back
+            to the default topic.
+
+    Returns:
+        Tuple ``(personality_analysis, generated_tweet)``. When the input
+        is missing or unusable, the first item carries a message and the
+        second item is empty.
+    """
+    # Without this guard a run with no upload fails on ``None.name``.
+    if pdf_file is None:
+        return "Please upload a PDF with tweets.", ""
+    # Accept both an upload object with a ``.name`` path and a bare path.
+    pdf_path = getattr(pdf_file, "name", pdf_file)
+    # An empty text box arrives as "", which would silently replace the
+    # default topic.
+    if not context or not context.strip():
+        context = "AI-powered tweet generation"
+    # Initialize the processor with uploaded PDF path
     fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
+    processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path=pdf_path)
+    # Extract text from PDF and process it
+    text = processor.extract_text_from_pdf()
+    try:
+        processor.process_pdf_content(text)
+        # Analyze personality based on tweets
+        personality_analysis = processor.analyze_personality(max_tweets=50)
+    except ValueError as exc:
+        # Empty or unparseable PDFs are user errors; report them in the
+        # same "Error: ..." style generate_tweet uses.
+        return f"Error: {exc}", ""
+    # Generate tweet based on the personality analysis and context
+    generated_tweet = processor.generate_tweet(context=context, sample_size=3)
+    return personality_analysis, generated_tweet
+# Gradio app setup: the two inputs are passed to gradio_interface in order
+# (uploaded file, then context text) and its two return values fill the two
+# output text boxes in order.
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[
+        gr.File(label="Upload PDF with Tweets"),
+        gr.Textbox(label="Context for Tweet Generation (optional)", placeholder="e.g., AI-powered tweet generation")
+    ],
+    outputs=[
+        gr.Textbox(label="Personality Analysis"),
+        gr.Textbox(label="Generated Tweet")
+    ],
+    # NOTE(review): live mode presumably re-runs fn whenever an input
+    # changes, and every run of gradio_interface builds a new
+    # TweetDatasetProcessor - confirm this cost is intended.
+    live=True,
+    title="AI Personality and Tweet Generation",
+    description="Automatically analyze personality and generate tweets based on a provided PDF of tweets."
+)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    iface.launch()
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import random
+from datetime import datetime
+from PyPDF2 import PdfReader
+import json
+from dotenv import load_dotenv
+load_dotenv()
+class TweetDatasetProcessor:
+    """Extract tweets from a PDF, profile their author and generate new tweets.
+
+    Both the profile and the new tweets are produced by the causal language
+    model named by ``fine_tuned_model_name``.
+    """
+
+    # (model, tokenizer) pairs keyed by model name. Shared by all instances
+    # so that building a new processor does not load the weights again.
+    _loaded_models = {}
+
+    # Generation budgets, counted in newly generated tokens only.
+    # NOTE(review): starting values - tune them for the deployed model.
+    ANALYSIS_MAX_NEW_TOKENS = 200
+    TWEET_MAX_NEW_TOKENS = 60
+
+    def __init__(self, fine_tuned_model_name, pdf_path=None):
+        """Create empty state and attach the model and tokenizer.
+
+        Args:
+            fine_tuned_model_name: Name or path handed to ``from_pretrained``.
+            pdf_path: Path of the PDF holding the tweets. May be None, in
+                which case ``extract_text_from_pdf`` returns "".
+        """
+        self.tweets = []
+        # Kept as a string so generate_tweet can slice it even when
+        # analyze_personality has not run yet.
+        self.personality_profile = ""
+        self.used_tweets = set()  # Track used tweets to avoid repetition
+        self.pdf_path = pdf_path
+        # Load fine-tuned model and tokenizer (once per model name)
+        cache = TweetDatasetProcessor._loaded_models
+        if fine_tuned_model_name not in cache:
+            cache[fine_tuned_model_name] = (
+                AutoModelForCausalLM.from_pretrained(fine_tuned_model_name),
+                AutoTokenizer.from_pretrained(fine_tuned_model_name),
+            )
+        self.model, self.tokenizer = cache[fine_tuned_model_name]
+
+    @staticmethod
+    def _process_line(line):
+        """Turn one line of text into a tweet record.
+
+        Returns:
+            None for blank lines and lines starting with ``http``; otherwise
+            a dict with the stripped ``content``, the ``timestamp`` at which
+            the line was processed, and the ``mentions`` / ``hashtags``
+            (whitespace-separated words starting with ``@`` / ``#``).
+        """
+        line = line.strip()
+        if not line or line.startswith('http'):  # Skip empty lines and URLs
+            return None
+        words = line.split()
+        return {
+            'content': line,
+            'timestamp': datetime.now(),
+            'mentions': [word for word in words if word.startswith('@')],
+            'hashtags': [word for word in words if word.startswith('#')],
+        }
+
+    def extract_text_from_pdf(self):
+        """Return the text of every page of ``self.pdf_path``.
+
+        Returns "" when no path is set. Pages are joined with a newline so
+        the last line of one page does not merge with the first line of the
+        next, and a page that yields no text contributes an empty string.
+        """
+        if not self.pdf_path:
+            return ""
+        reader = PdfReader(self.pdf_path)
+        return "\n".join(page.extract_text() or "" for page in reader.pages)
+
+    def process_pdf_content(self, text):
+        """Split extracted text into tweet records and store them.
+
+        Args:
+            text: Raw text, one tweet per line.
+
+        Returns:
+            The list of tweet dicts, also stored in ``self.tweets``.
+
+        Raises:
+            ValueError: If ``text`` is empty or no line yields a tweet.
+        """
+        if not text or not text.strip():
+            raise ValueError("The provided PDF appears to be empty.")
+        lines = text.split('\n')
+        clean_tweets = [TweetDatasetProcessor._process_line(line) for line in lines]
+        self.tweets = [tweet for tweet in clean_tweets if tweet]
+        if not self.tweets:
+            raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")
+        return self.tweets
+
+    def _generate_text(self, prompt, max_new_tokens, temperature):
+        """Run the model on ``prompt`` and return only the newly generated text."""
+        # NOTE(review): assumes the model config exposes its context size as
+        # max_position_embeddings; 1024 is a fallback - confirm for the model.
+        context_limit = getattr(self.model.config, "max_position_embeddings", 1024)
+        # Trim the prompt so prompt + new tokens fit in the context window.
+        encoded = self.tokenizer(
+            prompt,
+            return_tensors='pt',
+            truncation=True,
+            max_length=max(1, context_limit - max_new_tokens),
+        )
+        input_ids = encoded['input_ids']
+        output = self.model.generate(
+            input_ids,
+            attention_mask=encoded['attention_mask'],
+            # max_length would count the prompt as well and can leave no
+            # room for output; max_new_tokens budgets the output alone.
+            max_new_tokens=max_new_tokens,
+            num_return_sequences=1,
+            do_sample=True,  # temperature only applies when sampling
+            temperature=temperature,
+            pad_token_id=self.tokenizer.eos_token_id,
+        )
+        # The output sequence starts with the prompt tokens; drop them so
+        # the prompt is not echoed back to the caller.
+        new_tokens = output[0][input_ids.shape[-1]:]
+        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+
+    def analyze_personality(self, max_tweets=50):
+        """Generate a personality analysis from the first ``max_tweets`` tweets.
+
+        Returns:
+            The generated analysis text, also stored in
+            ``self.personality_profile``.
+
+        Raises:
+            ValueError: If no tweets have been loaded.
+        """
+        if not self.tweets:
+            raise ValueError("No tweets available for personality analysis.")
+        all_tweets = [tweet['content'] for tweet in self.tweets][:max_tweets]
+        analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
+        Core beliefs, emotional tendencies, cognitive patterns, etc.
+        Tweets for analysis:
+        {json.dumps(all_tweets, indent=2)}
+        """
+        self.personality_profile = self._generate_text(
+            analysis_prompt,
+            max_new_tokens=self.ANALYSIS_MAX_NEW_TOKENS,
+            temperature=0.7,
+        )
+        return self.personality_profile
+
+    def _sample_tweets(self, sample_size):
+        """Pick up to ``sample_size`` not-yet-used tweet contents and mark them used."""
+        available_tweets = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
+        if len(available_tweets) < sample_size:
+            self.used_tweets.clear()  # Reset used tweets if all have been used
+            available_tweets = self.tweets
+        # random.sample raises ValueError when asked for more items than
+        # exist, so cap the request at what is available.
+        count = max(0, min(sample_size, len(available_tweets)))
+        sampled_contents = [tweet['content'] for tweet in random.sample(available_tweets, count)]
+        # Update the used tweets tracker
+        self.used_tweets.update(sampled_contents)
+        return sampled_contents
+
+    def generate_tweet(self, context="", sample_size=3):
+        """Generate a new tweet from the profile, a topic and sample tweets.
+
+        Args:
+            context: Topic to steer the tweet; may be empty.
+            sample_size: Number of stored tweets to include as examples;
+                fewer are used when fewer are stored.
+
+        Returns:
+            The generated tweet text, or an ``"Error: ..."`` message when no
+            tweets have been loaded.
+        """
+        if not self.tweets:
+            return "Error: No tweets available for generation."
+        sampled_contents = self._sample_tweets(sample_size)
+        # Truncate personality profile to avoid token overflow
+        personality_profile_excerpt = str(self.personality_profile)[:400]
+        # Construct the prompt
+        prompt = f"""Based on this personality profile:
+        {personality_profile_excerpt}
+        Current context or topic (if any):
+        {context}
+        Tweets for context:
+        {', '.join(sampled_contents)}
+        **Only generate the tweet. Do not include analysis, explanation, or any other content.**
+        """
+        return self._generate_text(
+            prompt,
+            max_new_tokens=self.TWEET_MAX_NEW_TOKENS,
+            temperature=1.0,
+        )
+# Gradio Interface Function
+def gradio_interface(pdf_file, context="AI-powered tweet generation"):
+    """Analyze the uploaded PDF of tweets and generate one new tweet.
+
+    Args:
+        pdf_file: Value of the file-upload input: an object exposing the
+            path as ``.name``, a plain path string, or None when nothing
+            has been uploaded yet.
+        context: Topic for the generated tweet. A blank value falls back
+            to the default topic.
+
+    Returns:
+        Tuple ``(personality_analysis, generated_tweet)``. When the input
+        is missing or unusable, the first item carries a message and the
+        second item is empty.
+    """
+    # Without this guard a run with no upload fails on ``None.name``.
+    if pdf_file is None:
+        return "Please upload a PDF with tweets.", ""
+    # Accept both an upload object with a ``.name`` path and a bare path.
+    pdf_path = getattr(pdf_file, "name", pdf_file)
+    # An empty text box arrives as "", which would silently replace the
+    # default topic.
+    if not context or not context.strip():
+        context = "AI-powered tweet generation"
+    # Initialize the processor with uploaded PDF path
+    fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
+    processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path=pdf_path)
+    # Extract text from PDF and process it
+    text = processor.extract_text_from_pdf()
+    try:
+        processor.process_pdf_content(text)
+        # Analyze personality based on tweets
+        personality_analysis = processor.analyze_personality(max_tweets=50)
+    except ValueError as exc:
+        # Empty or unparseable PDFs are user errors; report them in the
+        # same "Error: ..." style generate_tweet uses.
+        return f"Error: {exc}", ""
+    # Generate tweet based on the personality analysis and context
+    generated_tweet = processor.generate_tweet(context=context, sample_size=3)
+    return personality_analysis, generated_tweet
+# Gradio app setup: the two inputs are passed to gradio_interface in order
+# (uploaded file, then context text) and its two return values fill the two
+# output text boxes in order.
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[
+        gr.File(label="Upload PDF with Tweets"),
+        gr.Textbox(label="Context for Tweet Generation (optional)", placeholder="e.g., AI-powered tweet generation")
+    ],
+    outputs=[
+        gr.Textbox(label="Personality Analysis"),
+        gr.Textbox(label="Generated Tweet")
+    ],
+    # NOTE(review): live mode presumably re-runs fn whenever an input
+    # changes, and every run of gradio_interface builds a new
+    # TweetDatasetProcessor - confirm this cost is intended.
+    live=True,
+    title="AI Personality and Tweet Generation",
+    description="Automatically analyze personality and generate tweets based on a provided PDF of tweets."
+)
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    iface.launch()
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import random
+from datetime import datetime
+from PyPDF2 import PdfReader
+import json
+from dotenv import load_dotenv
+load_dotenv()
+class TweetDatasetProcessor:
+    """Extract tweets from a PDF, profile their author and generate new tweets.
+
+    Both the profile and the new tweets are produced by the causal language
+    model named by ``fine_tuned_model_name``.
+    """
+
+    # (model, tokenizer) pairs keyed by model name. Shared by all instances
+    # so that building a new processor does not load the weights again.
+    _loaded_models = {}
+
+    # Generation budgets, counted in newly generated tokens only.
+    # NOTE(review): starting values - tune them for the deployed model.
+    ANALYSIS_MAX_NEW_TOKENS = 200
+    TWEET_MAX_NEW_TOKENS = 60
+
+    def __init__(self, fine_tuned_model_name, pdf_path=None):
+        """Create empty state and attach the model and tokenizer.
+
+        Args:
+            fine_tuned_model_name: Name or path handed to ``from_pretrained``.
+            pdf_path: Path of the PDF holding the tweets. May be None, in
+                which case ``extract_text_from_pdf`` returns "".
+        """
+        self.tweets = []
+        # Kept as a string so generate_tweet can slice it even when
+        # analyze_personality has not run yet.
+        self.personality_profile = ""
+        self.used_tweets = set()  # Track used tweets to avoid repetition
+        self.pdf_path = pdf_path
+        # Load fine-tuned model and tokenizer (once per model name)
+        cache = TweetDatasetProcessor._loaded_models
+        if fine_tuned_model_name not in cache:
+            cache[fine_tuned_model_name] = (
+                AutoModelForCausalLM.from_pretrained(fine_tuned_model_name),
+                AutoTokenizer.from_pretrained(fine_tuned_model_name),
+            )
+        self.model, self.tokenizer = cache[fine_tuned_model_name]
+
+    @staticmethod
+    def _process_line(line):
+        """Turn one line of text into a tweet record.
+
+        Returns:
+            None for blank lines and lines starting with ``http``; otherwise
+            a dict with the stripped ``content``, the ``timestamp`` at which
+            the line was processed, and the ``mentions`` / ``hashtags``
+            (whitespace-separated words starting with ``@`` / ``#``).
+        """
+        line = line.strip()
+        if not line or line.startswith('http'):  # Skip empty lines and URLs
+            return None
+        words = line.split()
+        return {
+            'content': line,
+            'timestamp': datetime.now(),
+            'mentions': [word for word in words if word.startswith('@')],
+            'hashtags': [word for word in words if word.startswith('#')],
+        }
+
+    def extract_text_from_pdf(self):
+        """Return the text of every page of ``self.pdf_path``.
+
+        Returns "" when no path is set. Pages are joined with a newline so
+        the last line of one page does not merge with the first line of the
+        next, and a page that yields no text contributes an empty string.
+        """
+        if not self.pdf_path:
+            return ""
+        reader = PdfReader(self.pdf_path)
+        return "\n".join(page.extract_text() or "" for page in reader.pages)
+
+    def process_pdf_content(self, text):
+        """Split extracted text into tweet records and store them.
+
+        Args:
+            text: Raw text, one tweet per line.
+
+        Returns:
+            The list of tweet dicts, also stored in ``self.tweets``.
+
+        Raises:
+            ValueError: If ``text`` is empty or no line yields a tweet.
+        """
+        if not text or not text.strip():
+            raise ValueError("The provided PDF appears to be empty.")
+        lines = text.split('\n')
+        clean_tweets = [TweetDatasetProcessor._process_line(line) for line in lines]
+        self.tweets = [tweet for tweet in clean_tweets if tweet]
+        if not self.tweets:
+            raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")
+        return self.tweets
+
+    def _generate_text(self, prompt, max_new_tokens, temperature):
+        """Run the model on ``prompt`` and return only the newly generated text."""
+        # NOTE(review): assumes the model config exposes its context size as
+        # max_position_embeddings; 1024 is a fallback - confirm for the model.
+        context_limit = getattr(self.model.config, "max_position_embeddings", 1024)
+        # Trim the prompt so prompt + new tokens fit in the context window.
+        encoded = self.tokenizer(
+            prompt,
+            return_tensors='pt',
+            truncation=True,
+            max_length=max(1, context_limit - max_new_tokens),
+        )
+        input_ids = encoded['input_ids']
+        output = self.model.generate(
+            input_ids,
+            attention_mask=encoded['attention_mask'],
+            # max_length would count the prompt as well and can leave no
+            # room for output; max_new_tokens budgets the output alone.
+            max_new_tokens=max_new_tokens,
+            num_return_sequences=1,
+            do_sample=True,  # temperature only applies when sampling
+            temperature=temperature,
+            pad_token_id=self.tokenizer.eos_token_id,
+        )
+        # The output sequence starts with the prompt tokens; drop them so
+        # the prompt is not echoed back to the caller.
+        new_tokens = output[0][input_ids.shape[-1]:]
+        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+
+    def analyze_personality(self, max_tweets=50):
+        """Generate a personality analysis from the first ``max_tweets`` tweets.
+
+        Returns:
+            The generated analysis text, also stored in
+            ``self.personality_profile``.
+
+        Raises:
+            ValueError: If no tweets have been loaded.
+        """
+        if not self.tweets:
+            raise ValueError("No tweets available for personality analysis.")
+        all_tweets = [tweet['content'] for tweet in self.tweets][:max_tweets]
+        analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
+        Core beliefs, emotional tendencies, cognitive patterns, etc.
+        Tweets for analysis:
+        {json.dumps(all_tweets, indent=2)}
+        """
+        self.personality_profile = self._generate_text(
+            analysis_prompt,
+            max_new_tokens=self.ANALYSIS_MAX_NEW_TOKENS,
+            temperature=0.7,
+        )
+        return self.personality_profile
+
+    def _sample_tweets(self, sample_size):
+        """Pick up to ``sample_size`` not-yet-used tweet contents and mark them used."""
+        available_tweets = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
+        if len(available_tweets) < sample_size:
+            self.used_tweets.clear()  # Reset used tweets if all have been used
+            available_tweets = self.tweets
+        # random.sample raises ValueError when asked for more items than
+        # exist, so cap the request at what is available.
+        count = max(0, min(sample_size, len(available_tweets)))
+        sampled_contents = [tweet['content'] for tweet in random.sample(available_tweets, count)]
+        # Update the used tweets tracker
+        self.used_tweets.update(sampled_contents)
+        return sampled_contents
+
+    def generate_tweet(self, context="", sample_size=3):
+        """Generate a new tweet from the profile, a topic and sample tweets.
+
+        Args:
+            context: Topic to steer the tweet; may be empty.
+            sample_size: Number of stored tweets to include as examples;
+                fewer are used when fewer are stored.
+
+        Returns:
+            The generated tweet text, or an ``"Error: ..."`` message when no
+            tweets have been loaded.
+        """
+        if not self.tweets:
+            return "Error: No tweets available for generation."
+        sampled_contents = self._sample_tweets(sample_size)
+        # Truncate personality profile to avoid token overflow
+        personality_profile_excerpt = str(self.personality_profile)[:400]
+        # Construct the prompt
+        prompt = f"""Based on this personality profile:
+        {personality_profile_excerpt}
+        Current context or topic (if any):
+        {context}
+        Tweets for context:
+        {', '.join(sampled_contents)}
+        **Only generate the tweet. Do not include analysis, explanation, or any other content.**
+        """
+        return self._generate_text(
+            prompt,
+            max_new_tokens=self.TWEET_MAX_NEW_TOKENS,
+            temperature=1.0,
+        )
+# Gradio Interface Function
+def gradio_interface(pdf_file, context="AI-powered tweet generation"):
+    """Analyze a PDF of tweets and generate one new tweet.
+
+    Args:
+        pdf_file: Value of the file-upload input: an object exposing the
+            path as ``.name``, a plain path string, or None. With None the
+            bundled ``'Dataset (4).pdf'`` is analyzed instead.
+        context: Topic for the generated tweet. A blank value falls back
+            to the default topic.
+
+    Returns:
+        Tuple ``(personality_analysis, generated_tweet)``. When the PDF
+        yields no tweets, the first item carries an ``"Error: ..."``
+        message and the second item is empty.
+    """
+    # An empty text box arrives as "", which would silently replace the
+    # default topic.
+    if not context or not context.strip():
+        context = "AI-powered tweet generation"
+    # Initialize the processor with uploaded PDF path
+    fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
+    # Honor the upload instead of always reading the bundled dataset; the
+    # bundled file stays as the fallback when nothing was uploaded.
+    if pdf_file is None:
+        pdf_path = 'Dataset (4).pdf'
+    else:
+        pdf_path = getattr(pdf_file, "name", pdf_file)
+    processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path=pdf_path)
+    # Extract text from PDF and process it
     text = processor.extract_text_from_pdf()
+    try:
+        processor.process_pdf_content(text)
+        # Analyze personality based on tweets
+        personality_analysis = processor.analyze_personality(max_tweets=50)
+    except ValueError as exc:
+        # Empty or unparseable PDFs are user errors; report them in the
+        # same "Error: ..." style generate_tweet uses.
+        return f"Error: {exc}", ""
+    # Generate tweet based on the personality analysis and context
+    generated_tweet = processor.generate_tweet(context=context, sample_size=3)
     return personality_analysis, generated_tweet
 # Gradio app setup
 iface = gr.Interface(
     fn=gradio_interface,
+    inputs=[
+        gr.File(label="Upload PDF with Tweets"),
+        gr.Textbox(label="Context for Tweet Generation (optional)", placeholder="e.g., AI-powered tweet generation")
+    ],
     outputs=[
         gr.Textbox(label="Personality Analysis"),
         gr.Textbox(label="Generated Tweet")