GilbertoEwaldFilho committed on
Commit
aea6f8b
·
verified ·
1 Parent(s): 2a371ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -15
app.py CHANGED
@@ -16,16 +16,41 @@ from huggingface_hub import InferenceClient
16
 
17
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
18
 
 
19
  # ================================
20
  # FUNÇÕES AUXILIARES
21
  # ================================
22
 
23
  def clean_answer(text: str) -> str:
 
 
 
 
 
 
 
 
 
24
  if not text:
25
  return ""
26
 
27
  text = str(text).strip()
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  patterns_to_remove = [
30
  r"(?i)^final answer[:\- ]*",
31
  r"(?i)^answer[:\- ]*",
@@ -35,40 +60,71 @@ def clean_answer(text: str) -> str:
35
  for p in patterns_to_remove:
36
  text = re.sub(p, "", text).strip()
37
 
38
- text = text.replace("\n", " ").replace("\r", " ").strip()
39
- text = re.sub(r"\s+", " ", text).strip()
 
 
 
40
 
41
- if len(text) > 2 and text.startswith(("'", '"')) and text.endswith(("'", '"')):
42
- text = text[1:-1]
43
 
 
44
  if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
45
- text = text[:-1]
46
 
47
- return text.strip()
48
 
49
 
50
  def enforce_numeric_format(question: str, answer: str) -> str:
 
 
 
 
 
 
51
  q = question.lower()
 
52
 
 
53
  if "two decimal places" in q or "2 decimal places" in q:
54
- match = re.search(r"[-+]?\d+(?:[.,]\d+)?", answer)
55
  if match:
56
  try:
57
  value = float(match.group(0).replace(",", ""))
58
  return f"{value:.2f}"
59
- except:
60
  pass
61
 
 
62
  if any(kw in q for kw in ["how many", "number of", "what year", "in which year"]):
63
- match = re.search(r"-?\d+", answer.replace(",", ""))
64
  if match:
65
  return match.group(0)
66
 
67
- return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
 
70
  def web_search(question: str, max_results: int = 5) -> str:
71
- snippets = []
 
 
 
72
  try:
73
  with DDGS() as ddgs:
74
  for r in ddgs.text(question, max_results=max_results, safesearch="moderate"):
@@ -87,6 +143,9 @@ def web_search(question: str, max_results: int = 5) -> str:
87
 
88
 
89
  def get_file_context(api_url: str, task_id: str, item: dict) -> str:
 
 
 
90
  file_name = (
91
  item.get("file_name")
92
  or item.get("filename")
@@ -114,7 +173,7 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
114
  if any(name_lower.endswith(ext) for ext in [".txt", ".csv", ".tsv"]):
115
  try:
116
  text = data.decode("utf-8", errors="replace")
117
- except:
118
  text = data.decode("latin-1", errors="replace")
119
  return f"[FILE TXT]\n{text[:8000]}"
120
 
@@ -128,7 +187,8 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
128
  print("[EXCEL PARSE ERROR]", e)
129
  return "[FILE] Spreadsheet exists but cannot parse."
130
 
131
- return f"[FILE BINARY: {file_name}] {len(data)} bytes"
 
132
 
133
  except Exception as e:
134
  print("[FILE ERROR]", e)
@@ -254,7 +314,13 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
254
  answer = agent(qtext, file_context)
255
 
256
  answers_payload.append({"task_id": qid, "submitted_answer": answer})
257
- results_log.append({"Task ID": qid, "Question": qtext, "Submitted Answer": answer})
 
 
 
 
 
 
258
 
259
  submission = {
260
  "username": username,
@@ -298,4 +364,4 @@ with gr.Blocks() as demo:
298
 
299
 
300
  if __name__ == "__main__":
301
- demo.launch(debug=True, share=False)
 
16
 
17
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
18
 
19
+
20
  # ================================
21
  # FUNÇÕES AUXILIARES
22
  # ================================
23
 
24
  def clean_answer(text: str) -> str:
25
+ """
26
+ Limpa a resposta do modelo para bater em EXACT MATCH:
27
+ - remove blocos <think>...</think> (Qwen Thinking)
28
+ - remove tags <think> soltas
29
+ - remove tags HTML genéricas
30
+ - remove prefixos tipo 'Final answer', 'Answer:'
31
+ - remove aspas externas
32
+ - normaliza espaços e ponto final solto
33
+ """
34
  if not text:
35
  return ""
36
 
37
  text = str(text).strip()
38
 
39
+ # Remover blocos <think>...</think>
40
+ text = re.sub(
41
+ r"<think>.*?</think>",
42
+ "",
43
+ text,
44
+ flags=re.DOTALL | re.IGNORECASE
45
+ ).strip()
46
+
47
+ # Remover tags <think> / </think> soltas
48
+ text = re.sub(r"</?think>", "", text, flags=re.IGNORECASE).strip()
49
+
50
+ # Remover qualquer tag HTML genérica
51
+ text = re.sub(r"<[^>]+>", "", text).strip()
52
+
53
+ # Remover prefixos do tipo "Final answer", "Answer:", etc.
54
  patterns_to_remove = [
55
  r"(?i)^final answer[:\- ]*",
56
  r"(?i)^answer[:\- ]*",
 
60
  for p in patterns_to_remove:
61
  text = re.sub(p, "", text).strip()
62
 
63
+ # Remover aspas externas
64
+ if len(text) > 2 and text.startswith('"') and text.endswith('"'):
65
+ text = text[1:-1].strip()
66
+ if len(text) > 2 and text.startswith("'") and text.endswith("'"):
67
+ text = text[1:-1].strip()
68
 
69
+ # Normalizar espaços
70
+ text = re.sub(r"\s+", " ", text).strip()
71
 
72
+ # Tirar ponto final solto
73
  if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
74
+ text = text[:-1].strip()
75
 
76
+ return text
77
 
78
 
79
  def enforce_numeric_format(question: str, answer: str) -> str:
80
+ """
81
+ Pós-processa a resposta para:
82
+ - garantir duas casas decimais quando pedido
83
+ - extrair inteiros quando a pergunta é "how many / number of / what year"
84
+ - extrair códigos (NASA award, IOC code, etc.) quando a pergunta pede isso
85
+ """
86
  q = question.lower()
87
+ a = answer
88
 
89
+ # 1) Valores com duas casas decimais (ex: USD)
90
  if "two decimal places" in q or "2 decimal places" in q:
91
+ match = re.search(r"[-+]?\d+(?:[.,]\d+)?", a)
92
  if match:
93
  try:
94
  value = float(match.group(0).replace(",", ""))
95
  return f"{value:.2f}"
96
+ except Exception:
97
  pass
98
 
99
+ # 2) Perguntas tipo "how many", "number of", "what year", "in which year"
100
  if any(kw in q for kw in ["how many", "number of", "what year", "in which year"]):
101
+ match = re.search(r"-?\d+", a.replace(",", ""))
102
  if match:
103
  return match.group(0)
104
 
105
+ # 3) Códigos tipo "IOC country code", "award number", "NASA award"
106
+ if (
107
+ "ioc country code" in q
108
+ or "award number" in q
109
+ or "nasa award" in q
110
+ or "grant number" in q
111
+ or "award no." in q
112
+ ):
113
+ # Procura tokens alfanuméricos em MAIÚSCULAS (3+ chars)
114
+ tokens = re.findall(r"[A-Z0-9]{3,}", a)
115
+ if tokens:
116
+ # Heurística simples: pega o token mais longo
117
+ best = max(tokens, key=len)
118
+ return best
119
+
120
+ return a
121
 
122
 
123
  def web_search(question: str, max_results: int = 5) -> str:
124
+ """
125
+ Usa DuckDuckGo (ddgs) pra buscar snippets de contexto.
126
+ """
127
+ snippets: List[str] = []
128
  try:
129
  with DDGS() as ddgs:
130
  for r in ddgs.text(question, max_results=max_results, safesearch="moderate"):
 
143
 
144
 
145
  def get_file_context(api_url: str, task_id: str, item: dict) -> str:
146
+ """
147
+ Tenta baixar o arquivo de /files/{task_id} e extrair texto/planilha.
148
+ """
149
  file_name = (
150
  item.get("file_name")
151
  or item.get("filename")
 
173
  if any(name_lower.endswith(ext) for ext in [".txt", ".csv", ".tsv"]):
174
  try:
175
  text = data.decode("utf-8", errors="replace")
176
+ except Exception:
177
  text = data.decode("latin-1", errors="replace")
178
  return f"[FILE TXT]\n{text[:8000]}"
179
 
 
187
  print("[EXCEL PARSE ERROR]", e)
188
  return "[FILE] Spreadsheet exists but cannot parse."
189
 
190
+ # Outros tipos
191
+ return f"[FILE BINARY: {file_name}] {len(data)} bytes (type: {content_type})"
192
 
193
  except Exception as e:
194
  print("[FILE ERROR]", e)
 
314
  answer = agent(qtext, file_context)
315
 
316
  answers_payload.append({"task_id": qid, "submitted_answer": answer})
317
+ results_log.append(
318
+ {
319
+ "Task ID": qid,
320
+ "Question": qtext,
321
+ "Submitted Answer": answer,
322
+ }
323
+ )
324
 
325
  submission = {
326
  "username": username,
 
364
 
365
 
366
  if __name__ == "__main__":
367
+ demo.launch(debug=True, share=False)