shijisan committed on
Commit
d5ba0c8
·
verified ·
1 Parent(s): 2035bab

updated app.py to clean text input before sending to model

Browse files
Files changed (1) hide show
  1. app.py +12 -1
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  from fastapi import FastAPI
3
  from pydantic import BaseModel
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
@@ -26,10 +27,20 @@ except Exception as e:
26
  class InputText(BaseModel):
27
  text: str
28
 
 
 
 
 
 
 
29
  @app.post("/summarize")
30
  async def summarize(input: InputText):
 
 
 
 
31
  inputs = tokenizer(
32
- input.text,
33
  return_tensors="pt",
34
  max_length=16384,
35
  truncation=True,
 
1
  import os
2
+ import re
3
  from fastapi import FastAPI
4
  from pydantic import BaseModel
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
27
  class InputText(BaseModel):
28
  text: str
29
 
30
+ def clean_text(text: str) -> str:
31
+ text = re.sub(r"[\r\n\t]+", " ", text)
32
+ text = re.sub(r"\s{2,}", " ", text)
33
+ text = text.strip()
34
+ return text
35
+
36
  @app.post("/summarize")
37
  async def summarize(input: InputText):
38
+
39
+ cleaned_input = clean_text(input.text)
40
+ prompt = f"summarize: {cleaned_input}"
41
+
42
  inputs = tokenizer(
43
+ prompt,
44
  return_tensors="pt",
45
  max_length=16384,
46
  truncation=True,