mitesh001 committed on
Commit
67f0b1c
·
1 Parent(s): 10c7644

Update more accuracy level

Browse files
Files changed (2) hide show
  1. main.py +173 -53
  2. requirements.txt +2 -1
main.py CHANGED
@@ -4,18 +4,24 @@ from pydantic import BaseModel
4
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
5
  import dateparser
6
  from datetime import datetime
7
- from langdetect import detect
8
  from textblob import TextBlob
9
  from dateparser.search import search_dates
10
  import uuid
11
  import time
 
 
 
 
12
 
13
  from fastapi.middleware.cors import CORSMiddleware
14
  from fastapi.responses import JSONResponse
 
15
  from fastapi.requests import Request
16
  from fastapi import status
 
17
 
18
- app = FastAPI()
19
  app.add_middleware(
20
  CORSMiddleware,
21
  allow_origins=["*"], # or your domain(s)
@@ -39,42 +45,73 @@ ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_str
39
 
40
  # Labels for classification
41
  labels = [
42
- "task", "event", "reminder", "meeting", "relationship", "note", "journal", "memory", "status_update",
43
- "sick_notice", "out_of_office", "travel_plan", "celebration", "emotion", "news", "information", "other"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  ]
45
 
46
  class TextInput(BaseModel):
47
  text: str
48
 
49
  # Function to extract dates and time mentions based on regex patterns
50
- def extract_dates(text):
51
- time_expressions = re.findall(
52
- r'\b(?:\d{1,2}(?:st|nd|rd|th)?\s+(January|February|March|April|May|June|July|August|September|October|November|December)(?:\s+\d{4})?|\d{1,2}:\d{2}\s?(AM|PM|am|pm)?)\b',
53
- text, flags=re.IGNORECASE)
54
- parsed = [str(dateparser.parse(t)) for t in time_expressions if dateparser.parse(t)]
55
- return list(set(parsed)), list(set(time_expressions))
56
-
57
- # Function to detect tense based on parsed dates
58
- def extract_dates_with_accuracy(text):
59
- settings = {
60
  "PREFER_DATES_FROM": "future", # Bias future
61
  "RELATIVE_BASE": datetime.now(), # Anchor to now
62
  "RETURN_AS_TIMEZONE_AWARE": False, # Use naive datetime
63
- }
64
 
65
- results = search_dates(text, settings=settings)
66
- time_mentions, parsed = [], []
67
 
68
  if results:
69
- for mention, dt in results:
70
- if len(mention.strip()) <= 3:
71
- continue # skip vague/short like "on", "to"
72
- if dt:
73
- # Convert to clean ISO format (e.g. "2025-07-14T11:00:00")
74
- parsed.append(dt.isoformat())
75
- time_mentions.append(mention.strip())
 
 
 
76
 
77
- return list(set(parsed)), list(set(time_mentions))
 
 
 
 
 
 
78
 
79
  def detect_tense(parsed_dates):
80
  now = datetime.now()
@@ -96,19 +133,14 @@ def generate_summary(text):
96
  output_ids = summarizer_model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
97
  return summarizer_tokenizer.decode(output_ids[0], skip_special_tokens=True)
98
 
99
-
100
- def extract_people(text):
101
- ner_results = ner_pipeline(text)
102
- return list(set(ent['word'] for ent in ner_results if ent['entity_group'] == 'PER'))
103
-
104
  def estimate_mood(text):
105
  text_lower = text.lower()
106
  mood_map = {
107
- "happy": ["happy", "excited", "joy", "grateful"],
108
- "sad": ["sad", "upset", "crying", "lonely"],
109
- "angry": ["angry", "annoyed", "frustrated", "irritated"],
110
- "nervous": ["nervous", "anxious", "scared"],
111
- "unwell": ["sick", "unwell", "not feeling well", "fever", "cold", "headache"],
112
  "neutral": []
113
  }
114
 
@@ -132,10 +164,11 @@ def generate_tags(label, text):
132
 
133
  # Detect language using langdetect
134
  def detect_language(text):
135
- try:
136
- return detect(text)
137
- except:
138
- return "unknown"
 
139
 
140
  # Detect sentiment using TextBlob
141
  def get_sentiment_score(text):
@@ -239,42 +272,127 @@ def get_meta_info(text: str):
239
  "year": now.year # 0 to 23
240
  }
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  @app.get("/health")
243
  def health_check():
244
  return {"message": "✅ Hello from yourpartner/demospace — API is running!"}
245
 
246
  @app.exception_handler(404)
247
  async def not_found_handler(request: Request, exc):
248
- return JSONResponse(status_code=404, content={"error": "Route not found"})
249
 
250
  @app.exception_handler(500)
251
  async def internal_error_handler(request: Request, exc):
252
- return JSONResponse(status_code=500, content={"error": "Internal server error"})
253
 
254
- @app.post("/analyze")
255
  async def analyze(input: TextInput):
256
  start_time = time.time() # ⏱️ start
257
 
258
  text = input.text
259
 
260
- classification = classifier(text, labels)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  best_label = classification['labels'][0]
262
 
 
 
263
  if "reported" in text or "announced" in text or "collapsed" in text:
264
  if best_label in ["task", "reminder", "event"]:
265
  best_label = "news"
266
 
267
  scores = dict(zip(classification['labels'], classification['scores']))
268
-
269
- parsed_dates, time_mentions = extract_dates_with_accuracy(text)
270
- tenses = detect_tense(parsed_dates)
271
- summary = generate_summary(text).removeprefix("summary:").strip()
272
- people = extract_people(text)
 
 
 
 
 
273
  mood = estimate_mood(text)
274
  tags = generate_tags(best_label, text)
275
  language_detected = detect_language(text)
276
- sentiment_score = get_sentiment_score(text)
277
- entities = extract_entities(text)
 
278
  intent = infer_intent(best_label, text)
279
  urgency_score = get_urgency_score(text, parsed_dates)
280
 
@@ -289,7 +407,7 @@ async def analyze(input: TextInput):
289
  end_time = time.time() # ⏱️ end
290
  processing_time_ms = round((end_time - start_time) * 1000)
291
 
292
- return {
293
  "uuid": str(uuid.uuid4()), # Unique identifier for the request
294
  "raw_text": text,
295
  "word_count": meta["word_count"],
@@ -299,12 +417,12 @@ async def analyze(input: TextInput):
299
  "year": meta["year"],
300
  "type": best_label,
301
  "intent": intent,
302
- "confidence_scores": scores,
303
  "urgency_score": urgency_score,
304
  "time_mentions": time_mentions,
305
  "parsed_dates": parsed_dates,
306
  "tense": tenses,
307
- "summary": summary,
308
  "people": people,
309
  "mood": mood,
310
  "language": language_detected,
@@ -312,6 +430,8 @@ async def analyze(input: TextInput):
312
  "tags": tags,
313
  "action_required": action_required,
314
  "entities": entities,
 
315
  "processing_time_ms": processing_time_ms
316
  }
 
317
 
 
4
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
5
  import dateparser
6
  from datetime import datetime
7
+ from langdetect import detect_langs
8
  from textblob import TextBlob
9
  from dateparser.search import search_dates
10
  import uuid
11
  import time
12
+ import warnings
13
+ warnings.filterwarnings("ignore", category=FutureWarning)
14
+ warnings.filterwarnings("ignore", category=UserWarning)
15
+
16
 
17
  from fastapi.middleware.cors import CORSMiddleware
18
  from fastapi.responses import JSONResponse
19
+ from fastapi.responses import ORJSONResponse
20
  from fastapi.requests import Request
21
  from fastapi import status
22
+ import asyncio
23
 
24
+ app = FastAPI(default_response_class=ORJSONResponse)
25
  app.add_middleware(
26
  CORSMiddleware,
27
  allow_origins=["*"], # or your domain(s)
 
45
 
46
  # Labels for classification
47
  labels = [
48
+ "task (something to be done or completed)",
49
+ "event (an activity that is happening or has happened)",
50
+ "reminder (a message to remember something in the future)",
51
+ "meeting (a planned gathering between people to discuss something)",
52
+ "relationship (message about personal or emotional connection with someone)",
53
+ "note (general note or quick thought not related to any specific category)",
54
+ "journal (personal reflection or emotional writing about one's day or thoughts)",
55
+ "memory (recollection or recording of a past moment or experience)",
56
+ "status_update (current condition, feeling, or situation being shared)",
57
+ "sick_notice (informing about illness or not feeling well)",
58
+ "out_of_office (message about being unavailable for work or responsibilities)",
59
+ "travel_plan (planning or mentioning a trip or journey)",
60
+ "celebration (message about a festive occasion, party or achievement)",
61
+ "expense (money spent on something, either small or large)",
62
+ "news (update about public events, announcements, or current affairs)",
63
+ "information (factual content or informative message not tied to user activity)",
64
+ "purchase (buying or ordering something, like a product or service)",
65
+ "other (does not clearly fall into any specific category)"
66
+ ]
67
+
68
+ expense_keywords = [
69
+ "paid", "bought", "purchased", "ordered", "spent", "payment",
70
+ "recharged", "booked", "transaction", "debit", "renewed",
71
+ "credit card", "cash", "amount", "transfer", "EMI", "wallet",
72
+ "petrol", "bill", "invoice"
73
  ]
74
 
75
  class TextInput(BaseModel):
76
  text: str
77
 
78
  # Function to extract dates and time mentions using dateparser's search_dates
79
def extract_dates_with_accuracy(text: str, amounts: list):
    """Find date/time phrases in *text* and parse them to ISO-8601 strings.

    Phrases that are really monetary amounts or other noise are dropped so
    that e.g. "paid 1200" is not reported as the year 1200.

    Args:
        text: Free-form user text to scan.
        amounts: Amount records as dicts carrying a numeric ``"value"`` key;
            their integer parts are used to discard numeric false positives.

    Returns:
        A ``(parsed_dates, time_mentions)`` tuple of parallel lists:
        ISO-formatted datetimes and the lower-cased phrases they came from.
        The order matches how the ``/analyze`` handler unpacks the result.
    """
    # Integer renderings of the detected amounts, e.g. 1200.0 -> "1200".
    amount_values = {
        str(int(a["value"]))
        for a in amounts
        if isinstance(a["value"], (int, float))
    }

    results = search_dates(
        text,
        settings={
            "PREFER_DATES_FROM": "future",  # Bias future
            "RELATIVE_BASE": datetime.now(),  # Anchor to now
            "RETURN_AS_TIMEZONE_AWARE": False,  # Use naive datetime
        },
    )

    parsed_dates = []
    time_mentions = []
    if not results:
        return parsed_dates, time_mentions

    noise_words = {"on", "at", "in", "by", "to", "of"}
    for phrase, parsed in results:
        clean_phrase = phrase.strip().lower()

        # A bare number that was already recognised as a money amount.
        if clean_phrase in amount_values:
            continue
        # Prepositions that are sometimes picked up as dates.
        if clean_phrase in noise_words:
            continue
        # Stand-alone 3-4 digit numbers are far more often counts or prices
        # than years or times.
        if re.fullmatch(r"\d{3,4}", clean_phrase):
            continue

        time_mentions.append(clean_phrase)
        parsed_dates.append(parsed.isoformat())

    # Parsed dates first: the caller unpacks `parsed_dates, time_mentions`.
    return parsed_dates, time_mentions
115
 
116
  def detect_tense(parsed_dates):
117
  now = datetime.now()
 
133
  output_ids = summarizer_model.generate(input_ids, max_length=60, num_beams=4, early_stopping=True)
134
  return summarizer_tokenizer.decode(output_ids[0], skip_special_tokens=True)
135
 
 
 
 
 
 
136
  def estimate_mood(text):
137
  text_lower = text.lower()
138
  mood_map = {
139
+ "happy": ["happy", "excited", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated", "joyful", "optimistic", "hopeful", "proud", "relieved", "enthusiastic"],
140
+ "sad": ["sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy", "dismayed", "discouraged", "disheartened"],
141
+ "angry": ["angry", "annoyed", "frustrated", "irritated", "mad", "furious", "enraged", "livid", "outraged", "infuriated", "exasperated", "indignant", "resentful", "incensed", "fuming", "seething"],
142
+ "nervous": ["nervous", "anxious", "scared", "worried", "fearful", "uneasy", "apprehensive", "tense", "jittery", "restless", "on edge", "panicky", "fidgety", "edgy", "stressed"],
143
+ "unwell": ["sick", "unwell", "not feeling well", "fever", "cold", "headache", "flu", "ill", "nauseous", "dizzy", "tired", "exhausted", "fatigued", "weak", "pain", "ache", "vomit", "cough", "sneeze", "chills", "shivers", "congestion", "runny nose", "coughing", "sore throat"],
144
  "neutral": []
145
  }
146
 
 
164
 
165
  # Detect language using langdetect
166
  def detect_language(text):
167
+ langs = detect_langs(text) # returns list like: [en:0.99, hi:0.01]
168
+ if langs:
169
+ top_lang = langs[0]
170
+ return {"lang": top_lang.lang, "prob": round(top_lang.prob, 6)}
171
+ return {"lang": "unknown", "prob": 0}
172
 
173
  # Detect sentiment using TextBlob
174
  def get_sentiment_score(text):
 
272
  "year": now.year # 0 to 23
273
  }
274
 
275
+ # Function to extract amounts in various currencies from text
276
def extract_amounts(text: str):
    """Extract monetary amounts in INR, USD and EUR from free text.

    Recognises currency symbols/codes before a number (``₹500``, ``Rs. 500``,
    ``INR 500``, ``$20``, ``EUR 20``), symbols after a number (``100$``,
    ``20€``), currency words (``rupees``, ``dollars``, ``euros``, ``cents``)
    and Indian scale words (``lac``/``lakh``, ``crore``/``cr``). Matching is
    case-insensitive and numbers may carry thousands separators, including
    Indian grouping such as ``1,00,000``.

    Args:
        text: Free-form user text to scan.

    Returns:
        A list of ``{"value": float, "currency": str}`` dicts in order of
        appearance, with duplicate (value, currency) pairs and zero amounts
        removed.
    """
    num = r"(?P<num>\d+(?:,\d{2,3})*(?:\.\d+)?)"
    # (regex, currency code, multiplier). Scale/word forms come first so that
    # "₹5 lakh" is claimed as 500000 before the bare "₹5" form can match.
    currency_patterns = [
        (rf"{num}\s?(?:lacs?|lakhs?)\b", "INR", 100_000),
        (rf"{num}\s?(?:crores?|cr)\b", "INR", 10_000_000),
        (rf"{num}\s?cents?\b", "USD", 0.01),
        (rf"{num}\s?rupees?\b", "INR", 1),
        (rf"{num}\s?dollars?\b", "USD", 1),
        (rf"{num}\s?euros?\b", "EUR", 1),
        # Symbol or code before the number; \b keeps "hours 5" from matching
        # as "rs 5".
        (rf"(?:₹|\brs\.?|\binr)\s?{num}", "INR", 1),
        (rf"\$\s?{num}", "USD", 1),
        (rf"(?:€|\beur)\s?{num}", "EUR", 1),
        # Symbol after the number.
        (rf"{num}\s?\$", "USD", 1),
        (rf"{num}\s?€", "EUR", 1),
    ]

    claimed = []  # (start, end) spans of numbers already attributed
    found = []  # (position in text, value, currency)

    for pattern, currency_code, multiplier in currency_patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            start, end = match.span("num")
            # One number yields at most one amount.
            if any(s < end and start < e for s, e in claimed):
                continue
            value = round(float(match.group("num").replace(",", "")) * multiplier, 2)
            if not value:
                continue
            claimed.append((start, end))
            found.append((start, value, currency_code))

    found.sort(key=lambda item: item[0])

    results = []
    seen = set()
    for _, value, currency_code in found:
        key = (value, currency_code)
        if key in seen:
            continue
        seen.add(key)
        results.append({"value": value, "currency": currency_code})

    return results
328
+
329
  @app.get("/health")
330
  def health_check():
331
  return {"message": "✅ Hello from yourpartner/demospace — API is running!"}
332
 
333
  @app.exception_handler(404)
334
  async def not_found_handler(request: Request, exc):
335
+ return ORJSONResponse(status_code=404, content={"error": "Route not found"})
336
 
337
  @app.exception_handler(500)
338
  async def internal_error_handler(request: Request, exc):
339
+ return ORJSONResponse(status_code=500, content={"error": "Internal server error"})
340
 
341
+ @app.post("/analyze", response_class=ORJSONResponse)
342
  async def analyze(input: TextInput):
343
  start_time = time.time() # ⏱️ start
344
 
345
  text = input.text
346
 
347
+ label_map = {
348
+ "task (something to be done or completed)": "task",
349
+ "event (an activity that is happening or has happened)": "event",
350
+ "reminder (a message to remember something in the future)": "reminder",
351
+ "meeting (a planned gathering between people to discuss something)": "meeting",
352
+ "relationship (message about personal or emotional connection with someone)": "relationship",
353
+ "note (general note or quick thought not related to any specific category)": "note",
354
+ "journal (personal reflection or emotional writing about one's day or thoughts)": "journal",
355
+ "memory (recollection or recording of a past moment or experience)": "memory",
356
+ "status_update (current condition, feeling, or situation being shared)": "status_update",
357
+ "sick_notice (informing about illness or not feeling well)": "sick_notice",
358
+ "out_of_office (message about being unavailable for work or responsibilities)": "out_of_office",
359
+ "travel_plan (planning or mentioning a trip or journey)": "travel_plan",
360
+ "celebration (message about a festive occasion, party or achievement)": "celebration",
361
+ "expense (money spent on something, either small or large)": "expense",
362
+ "news (update about public events, announcements, or current affairs)": "news",
363
+ "information (factual content or informative message not tied to user activity)": "information",
364
+ "purchase (buying or ordering something, like a product or service)": "purchase",
365
+ "other (does not clearly fall into any specific category)": "other"
366
+ }
367
+
368
+ # classification = classifier(text, labels)
369
+ # Async call to classifier
370
+ classification = await asyncio.to_thread(classifier, text, labels)
371
  best_label = classification['labels'][0]
372
 
373
+ best_label = label_map.get(best_label, best_label)
374
+
375
  if "reported" in text or "announced" in text or "collapsed" in text:
376
  if best_label in ["task", "reminder", "event"]:
377
  best_label = "news"
378
 
379
  scores = dict(zip(classification['labels'], classification['scores']))
380
+ # # Convert to short labels
381
+ confidence_scores = {
382
+ label_map.get(label, label): score
383
+ for label, score in scores.items()
384
+ }
385
+
386
+ amounts = await asyncio.to_thread(extract_amounts, text)
387
+ parsed_dates, time_mentions = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
388
+ tenses = detect_tense(parsed_dates)
389
+ summary = await asyncio.to_thread(generate_summary, text)
390
  mood = estimate_mood(text)
391
  tags = generate_tags(best_label, text)
392
  language_detected = detect_language(text)
393
+ sentiment_score = get_sentiment_score(text)
394
+ entities = await asyncio.to_thread(extract_entities, text)
395
+ people = entities["people"] # Extracted people entities
396
  intent = infer_intent(best_label, text)
397
  urgency_score = get_urgency_score(text, parsed_dates)
398
 
 
407
  end_time = time.time() # ⏱️ end
408
  processing_time_ms = round((end_time - start_time) * 1000)
409
 
410
+ result = {
411
  "uuid": str(uuid.uuid4()), # Unique identifier for the request
412
  "raw_text": text,
413
  "word_count": meta["word_count"],
 
417
  "year": meta["year"],
418
  "type": best_label,
419
  "intent": intent,
420
+ "confidence_scores": confidence_scores,
421
  "urgency_score": urgency_score,
422
  "time_mentions": time_mentions,
423
  "parsed_dates": parsed_dates,
424
  "tense": tenses,
425
+ "summary": summary.removeprefix("summary:").strip(),
426
  "people": people,
427
  "mood": mood,
428
  "language": language_detected,
 
430
  "tags": tags,
431
  "action_required": action_required,
432
  "entities": entities,
433
+ "amounts": amounts,
434
  "processing_time_ms": processing_time_ms
435
  }
436
+ return ORJSONResponse(content=result)
437
 
requirements.txt CHANGED
@@ -8,4 +8,5 @@ langdetect
8
  textblob
9
  sentencepiece
10
  protobuf
11
- scikit-learn
 
 
8
  textblob
9
  sentencepiece
10
  protobuf
11
+ scikit-learn
12
+ orjson