jsemrau committed on
Commit
ed6e021
·
1 Parent(s): e63c60b

split utils and ui

Browse files
Files changed (2) hide show
  1. app.py +77 -92
  2. utils.py +93 -0
app.py CHANGED
@@ -32,7 +32,7 @@ from dotenv import load_dotenv
32
 
33
  # Load environment variables from .env file
34
  load_dotenv()
35
-
36
  news_selector=2
37
 
38
  # Set up logging
@@ -77,6 +77,77 @@ openai_key=os.getenv('OPENAI')
77
  DEFAULT_INTERESTS = os.getenv('INTERESTS', 'cognition, sentience, finance, investing, orchestration')
78
  USE_LOCAL_MODELS = os.getenv('USE_LOCAL_MODELS', 'false').lower() == 'true'
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def check_environment():
81
  """Check if required environment variables are set"""
82
  if not HF_TOKEN:
@@ -203,92 +274,6 @@ def initialize_editor():
203
  except Exception as e:
204
  return f"⌠Error initializing editor: {str(e)}"
205
 
206
- def clean_url(url):
207
- """Clean tracking parameters from URLs"""
208
- url = url.split('&')[0]
209
- url= url.rstrip('/')
210
- # Decode the path to fix encoded '?' or '=' that belong to the path, not query
211
- fixed_url = urllib.parse.unquote(url)
212
-
213
- return fixed_url
214
-
215
- def get_body(url):
216
- """Extract article content from URL"""
217
- body_text = ""
218
- try:
219
- headers = {
220
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
221
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
222
- "Accept-Language": "en-US,en;q=0.5",
223
- "Accept-Encoding": "gzip, deflate",
224
- "Connection": "keep-alive",
225
- "Upgrade-Insecure-Requests": "1",
226
- }
227
-
228
- headers = {
229
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
230
- }
231
-
232
- response = requests.get(url, headers=headers, timeout=10)
233
- response.raise_for_status()
234
-
235
- if url.endswith(".pdf") or "arxiv.org/pdf/" in url:
236
- # PDF content
237
- with BytesIO(response.content) as f:
238
- reader = PdfReader(f)
239
- text_parts = []
240
- for page in reader.pages:
241
- text_parts.append(page.extract_text() or "")
242
- body_text = "\n".join(text_parts)
243
- else:
244
- # HTML content
245
- soup = BeautifulSoup(response.text, "html.parser")
246
- paragraphs = soup.find_all(["p"])
247
- body_text = " ".join([p.get_text(strip=True) for p in paragraphs])
248
-
249
-
250
- except Exception as e:
251
- print(f"Failed to fetch {url}: {e}")
252
-
253
- return body_text
254
-
255
- def ner_tagger(text, model):
256
- """Extract named entities from text"""
257
- labels = ["Source", "Financial Metric", "Date", "Organization", "Person", "Product", "Percentage", "Monetary Value", "Duration"]
258
- entities = model.predict_entities(text, labels, threshold=0.1)
259
- return entities
260
-
261
- def remove_duplicate_relationships(data: str) -> str:
262
- """Remove duplicate relationships from knowledge graph"""
263
- lines = data.splitlines()
264
- triples = []
265
- subject = None
266
-
267
- for line in lines:
268
- parts = [part.strip() for part in line.split("-->")]
269
- if len(parts) != 3:
270
- continue
271
- else:
272
- if len(parts[0]) > 0:
273
- subject = parts[0]
274
- predicate = parts[1]
275
- obj = parts[2]
276
-
277
- triples.append((subject, predicate, obj))
278
-
279
- unique_triples = sorted(set(triples))
280
-
281
- grouped = defaultdict(list)
282
- for subj, pred, obj in unique_triples:
283
- grouped[subj].append(f" -->{pred}--> {obj}")
284
-
285
- output_lines = []
286
- for subj in grouped:
287
- output_lines.append(subj)
288
- output_lines.extend(grouped[subj])
289
-
290
- return '\n'.join(output_lines)
291
-
292
  def edit_single_article(post, edit_prompt):
293
  """Edit a single news article and generate LinkedIn post"""
294
  global editor_agent
@@ -657,8 +642,8 @@ def clear_work_queue():
657
  # Gradio Interface
658
  def create_interface():
659
  """Create the Gradio interface"""
660
- #, theme=gr.themes.Soft()
661
- with gr.Blocks(title="Post Generator") as app:
662
  gr.Markdown("#Post Generator")
663
  gr.Markdown("Generate engaging LinkedIn posts from recent news articles using AI agents and NER analysis.")
664
 
@@ -1010,9 +995,9 @@ if __name__ == "__main__":
1010
 
1011
 
1012
  #Initialize the model
1013
- print("Starting to initialize models")
1014
- initialize_models()
1015
- print("Models have been initialized")
1016
  # Create and launch the app
1017
  app = create_interface()
1018
 
 
32
 
33
  # Load environment variables from .env file
34
  load_dotenv()
35
+ from utils import clean_url, get_body,ner_tagger,remove_duplicate_relationships
36
  news_selector=2
37
 
38
  # Set up logging
 
77
  DEFAULT_INTERESTS = os.getenv('INTERESTS', 'cognition, sentience, finance, investing, orchestration')
78
  USE_LOCAL_MODELS = os.getenv('USE_LOCAL_MODELS', 'false').lower() == 'true'
79
 
80
+ # Check if HF_TOKEN is available
81
+ if not HF_TOKEN:
82
+ print("❌ HuggingFace token not found. Please check your .env file.")
83
+
84
+ try:
85
+ # Login to HuggingFace
86
+ login(HF_TOKEN, add_to_git_credential=False)
87
+
88
+ # Initialize NER model
89
+ print("Initialize NER")
90
+ ner_model = GLiNER.from_pretrained("knowledgator/modern-gliner-bi-large-v1.0")
91
+ print(f"Initialized NER")
92
+
93
+
94
+ llm_engine = InferenceClientModel(
95
+ api_key=HF_TOKEN,
96
+ model_id="Qwen/Qwen3-Coder-480B-A35B-Instruct" ,
97
+ timeout=3000,
98
+ provider="fireworks-ai",
99
+ temperature=0.25
100
+ )
101
+
102
+
103
+ # Initialize agent
104
+ agent = CodeAgent(
105
+ model=llm_engine,
106
+ tools=[],
107
+ add_base_tools=False,
108
+ name="data_agent",
109
+ description="Runs data analysis for you.",
110
+ max_steps=1,
111
+ )
112
+
113
+ # Initialize agent
114
+ writer_agent = CodeAgent(
115
+ model=llm_engine,
116
+ tools=[],
117
+ add_base_tools=False,
118
+ name="writer_agent",
119
+ description="Write an engaging and creative LinkedIn post.",
120
+ max_steps=5,
121
+ )
122
+
123
+ writer_engine = InferenceClientModel(
124
+ api_key=HF_TOKEN,
125
+ model_id="Qwen/Qwen3-Coder-480B-A35B-Instruct" ,
126
+ timeout=3000,
127
+ provider="fireworks-ai",
128
+ temperature=0.4
129
+ )
130
+
131
+
132
+ # Initialize agent
133
+ editor_agent = CodeAgent(
134
+ model=writer_engine,
135
+ tools=[],
136
+ add_base_tools=False,
137
+ name="editor_agent",
138
+ description="Edits LinkedIn post.",
139
+ max_steps=5,
140
+ )
141
+
142
+ # Add system prompt
143
+ #system_prompt = f"You are a strategic digital marketing manager focused on improving my social footprint. My interests are {interests}. You will receive a social media post. Please let me know which one I should react on."
144
+ #agent.prompt_templates["system_prompt"] += system_prompt
145
+
146
+ return "✅ Models initialized successfully!"
147
+
148
+ except Exception as e:
149
+ print( f"⌠Error initializing models: {str(e)}")
150
+
151
  def check_environment():
152
  """Check if required environment variables are set"""
153
  if not HF_TOKEN:
 
274
  except Exception as e:
275
  return f"⌠Error initializing editor: {str(e)}"
276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  def edit_single_article(post, edit_prompt):
278
  """Edit a single news article and generate LinkedIn post"""
279
  global editor_agent
 
642
  # Gradio Interface
643
  def create_interface():
644
  """Create the Gradio interface"""
645
+
646
+ with gr.Blocks(title="Post Generator", theme=gr.themes.Soft()) as app:
647
  gr.Markdown("#Post Generator")
648
  gr.Markdown("Generate engaging LinkedIn posts from recent news articles using AI agents and NER analysis.")
649
 
 
995
 
996
 
997
  #Initialize the model
998
+ #print("Starting to initialize models")
999
+ #initialize_models()
1000
+ #print("Models have been initialized")
1001
  # Create and launch the app
1002
  app = create_interface()
1003
 
utils.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tldextract import extract
2
+ from urllib.parse import quote_plus
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from io import BytesIO
6
+ from PyPDF2 import PdfReader
7
+ import urllib.parse
8
+
9
+ def clean_url(url):
10
+ """Clean tracking parameters from URLs"""
11
+ url = url.split('&')[0]
12
+ url= url.rstrip('/')
13
+ # Decode the path to fix encoded '?' or '=' that belong to the path, not query
14
+ fixed_url = urllib.parse.unquote(url)
15
+
16
+ return fixed_url
17
+
18
+ def get_body(url):
19
+ """Extract article content from URL"""
20
+ body_text = ""
21
+ try:
22
+ headers = {
23
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
24
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
25
+ "Accept-Language": "en-US,en;q=0.5",
26
+ "Accept-Encoding": "gzip, deflate",
27
+ "Connection": "keep-alive",
28
+ "Upgrade-Insecure-Requests": "1",
29
+ }
30
+
31
+ headers = {
32
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
33
+ }
34
+
35
+ response = requests.get(url, headers=headers, timeout=10)
36
+ response.raise_for_status()
37
+
38
+ if url.endswith(".pdf") or "arxiv.org/pdf/" in url:
39
+ # PDF content
40
+ with BytesIO(response.content) as f:
41
+ reader = PdfReader(f)
42
+ text_parts = []
43
+ for page in reader.pages:
44
+ text_parts.append(page.extract_text() or "")
45
+ body_text = "\n".join(text_parts)
46
+ else:
47
+ # HTML content
48
+ soup = BeautifulSoup(response.text, "html.parser")
49
+ paragraphs = soup.find_all(["p"])
50
+ body_text = " ".join([p.get_text(strip=True) for p in paragraphs])
51
+
52
+
53
+ except Exception as e:
54
+ print(f"Failed to fetch {url}: {e}")
55
+
56
+ return body_text
57
+
58
+ def ner_tagger(text, model):
59
+ """Extract named entities from text"""
60
+ labels = ["Source", "Financial Metric", "Date", "Organization", "Person", "Product", "Percentage", "Monetary Value", "Duration"]
61
+ entities = model.predict_entities(text, labels, threshold=0.1)
62
+ return entities
63
+
64
+ def remove_duplicate_relationships(data: str) -> str:
65
+ """Remove duplicate relationships from knowledge graph"""
66
+ lines = data.splitlines()
67
+ triples = []
68
+ subject = None
69
+
70
+ for line in lines:
71
+ parts = [part.strip() for part in line.split("-->")]
72
+ if len(parts) != 3:
73
+ continue
74
+ else:
75
+ if len(parts[0]) > 0:
76
+ subject = parts[0]
77
+ predicate = parts[1]
78
+ obj = parts[2]
79
+
80
+ triples.append((subject, predicate, obj))
81
+
82
+ unique_triples = sorted(set(triples))
83
+
84
+ grouped = defaultdict(list)
85
+ for subj, pred, obj in unique_triples:
86
+ grouped[subj].append(f" -->{pred}--> {obj}")
87
+
88
+ output_lines = []
89
+ for subj in grouped:
90
+ output_lines.append(subj)
91
+ output_lines.extend(grouped[subj])
92
+
93
+ return '\n'.join(output_lines)