Zevir committed on
Commit
4d16182
·
0 Parent(s):
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes). View file
 
app/__pycache__/preprocess.cpython-312.pyc ADDED
Binary file (774 Bytes). View file
 
app/__pycache__/train_finetune.cpython-312.pyc ADDED
Binary file (6.42 kB). View file
 
app/bert_classifier.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+
4
# Directory holding the fine-tuned checkpoint and its tokenizer files.
MODEL_DIR = "app/model"

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and model are loaded once, at import time, and shared by all calls.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model = model.to(device)
model.eval()  # evaluation mode for inference
10
+
11
+
12
def classify_text(text: str):
    """
    Classify a single text with the loaded model.

    Returns:
        pred: 0 (fake) or 1 (real)
        confidence: the highest class probability, as a plain float
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=256,
        padding=True,
        truncation=True,
    )
    inputs = inputs.to(device)

    # Inference only: no gradients are tracked.
    with torch.no_grad():
        logits = model(**inputs).logits
        # Softmax over the class axis; [0] picks the first row of the batch.
        class_probs = torch.softmax(logits, dim=1).cpu().numpy()[0]

    return int(class_probs.argmax()), float(class_probs.max())
app/main.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+
5
+ from app.bert_classifier import classify_text
6
+ from app.ocr import extract_text_from_image
7
+ from app.preprocess import preprocess_text
8
+
9
+
10
app = FastAPI(title="API Fake News — BERT Fine-Tuned", version="2.0.0")

# CORS: every origin, method and header is accepted.
# NOTE(review): this is fully open — confirm it is intended outside development.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
22
+
23
# Input/output models
class TextInput(BaseModel):
    """Request body of the /predict endpoint."""

    # Raw text to be classified.
    text: str
26
+
27
+
28
@app.get("/")
def root():
    """Return a fixed message showing that the API is up."""
    status = {"message": "API Fake News — BERT Fine-Tuned ativa!"}
    return status
31
+
32
+ # OCR
33
@app.post("/img-to-txt")
async def img_to_txt(file: UploadFile = File(...)):
    """Run OCR on the uploaded image and return the extracted text."""
    payload = await file.read()
    return {"text": extract_text_from_image(payload)}
38
+
39
+ # Predição
40
@app.post("/predict")
async def predict_text(input: TextInput):
    """
    Classify a news text as real or fake.

    The text is preprocessed first. If fewer than 20 characters remain, the
    model is not called and an "Indefinido" result with confidence 0.0 is
    returned. Otherwise the response holds the label, the confidence rounded
    to 3 decimals, and a message picked by confidence band (> 0.90, > 0.75,
    or anything lower).
    """
    cleaned = preprocess_text(input.text)

    # Guard clause: not enough text to classify.
    if len(cleaned) < 20:
        return {
            "prediction": "Indefinido",
            "confidence": 0.0,
            "message": "O texto é muito curto para análise confiável."
        }

    pred, conf = classify_text(cleaned)

    # Per class: the label plus the messages for the high, medium and low bands.
    if pred == 1:
        label = "Notícia Real"
        by_band = (
            "Essa notícia parece altamente confiável.",
            "Provável notícia real, mas é bom conferir as fontes.",
            "O modelo pende para real, mas com baixa confiança.",
        )
    else:
        label = "Fake News"
        by_band = (
            "Forte indicação de que esta notícia é falsa.",
            "Provável conteúdo falso, mas recomenda-se verificar fontes.",
            "O modelo pende para falsa, mas sem alta confiança.",
        )

    if conf > 0.90:
        message = by_band[0]
    elif conf > 0.75:
        message = by_band[1]
    else:
        message = by_band[2]

    return {
        "prediction": label,
        "confidence": round(conf, 3),
        "message": message
    }
app/model/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "directionality": "bidi",
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "output_past": true,
19
+ "pad_token_id": 0,
20
+ "pooler_fc_size": 768,
21
+ "pooler_num_attention_heads": 12,
22
+ "pooler_num_fc_layers": 3,
23
+ "pooler_size_per_head": 128,
24
+ "pooler_type": "first_token_transform",
25
+ "position_embedding_type": "absolute",
26
+ "problem_type": "single_label_classification",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.55.4",
29
+ "type_vocab_size": 2,
30
+ "use_cache": true,
31
+ "vocab_size": 29794
32
+ }
app/model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27642721c62ef81f6961c3051c3246035166f90d888b853f4e91e2ca8ae3c460
3
+ size 435722224
app/model/pytorch_model.bin ADDED
File without changes
app/model/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
3
+ size 125
app/model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b22b95acf8d863293658d68a3996f22ee077bc792415c976e632049e1e399466
3
+ size 678055
app/model/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ab044e4a71cdb2a5cff548e16d3bcd46a757848ef861743426c44a134b00da1
3
+ size 1301
app/model/vocab.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69c28584c67a0e5018f85ca734aa272cc38e26b5dd0d33fffa28059299f21707
3
+ size 209528
app/ocr.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import pytesseract
2
+ from PIL import Image
3
+ from io import BytesIO
4
+
5
def extract_text_from_image(image_bytes):
    """
    Run Tesseract OCR over an in-memory image using the "por" language data.

    Args:
        image_bytes: raw bytes of an encoded image file.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # The context manager closes the image as soon as OCR is done instead of
    # leaving its resources to the garbage collector.
    with Image.open(BytesIO(image_bytes)) as img:
        return pytesseract.image_to_string(img, lang="por")
app/preprocess.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+
4
def preprocess_text(text):
    """
    Clean raw text before it is handed to the tokenizer.

    Steps, in order: NFKC normalization; removal of URLs; removal of
    ``<...>`` tags; replacement with a space of every character that is not
    a word character, a listed accented letter, one of ``?!,.`` or a space;
    collapsing of whitespace runs; trimming of both ends.
    """
    substitutions = (
        (r"http\S+|www\.\S+", ""),  # URLs
        (r"<.*?>", ""),  # tag-like spans
        (r"[^\wÀ-ÖØ-öø-ÿ?!,. ]", " "),  # everything outside the allowed set
        (r"\s+", " "),  # runs of whitespace
    )

    cleaned = unicodedata.normalize("NFKC", text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
11
+
app/train_finetune.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from torch.utils.data import DataLoader, Dataset
4
+ from torch.optim import AdamW
5
+ from transformers import (
6
+ AutoTokenizer,
7
+ AutoModelForSequenceClassification,
8
+ get_linear_schedule_with_warmup
9
+ )
10
+ from tqdm import tqdm
11
+ import random
12
+
13
+ from app.preprocess import preprocess_text
14
+
15
# SETTINGS
# Base checkpoint to fine-tune and the folder the result is saved to.
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
OUTPUT_DIR = "app/model"

# Folders walked for the .txt samples of each class.
FAKE_DIR = "data/fake_news/financeiros"
REAL_DIR = "data/real_news/financeiros"

# Training hyperparameters.
EPOCHS = 3
BATCH_SIZE = 8
LR = 2e-5
MAX_LEN = 256

# Train on the GPU when torch can see one; otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🔥 Treinando em: {device}")
28
+
29
# HELPER FUNCTIONS
def load_texts_from_dir(directory, label):
    """
    Recursively read every ``.txt`` file under ``directory``.

    Args:
        directory: root folder to walk, subfolders included.
        label: class label attached to every sample read from this folder.

    Returns:
        A list of ``(preprocessed_text, label)`` tuples. Files that cannot be
        read or decoded as UTF-8 are reported and skipped. Files whose text
        is empty after preprocessing are skipped as well.
    """
    samples = []

    for root, _, files in os.walk(directory):
        for fname in files:
            if not fname.endswith(".txt"):
                continue

            path = os.path.join(root, fname)
            try:
                with open(path, "r", encoding="utf-8") as f:
                    raw = f.read()
            except (OSError, UnicodeDecodeError) as e:
                # Best effort: one unreadable file must not abort the load.
                print(f"⚠ Erro ao ler {path}: {e}")
                continue

            text = preprocess_text(raw)
            if not text:
                # An empty document would become a labelled sample with no
                # content at all, so it is left out.
                continue
            samples.append((text, label))

    return samples
47
+
48
+
49
def load_dataset():
    """
    Load the fake and real samples as a single shuffled dataset.

    Returns:
        ``(texts, labels)``: two parallel lists. Samples read from FAKE_DIR
        carry label 0 and samples read from REAL_DIR carry label 1.

    Raises:
        ValueError: if neither folder yields any sample.
    """
    print("📂 Carregando dados das pastas...")
    fake = load_texts_from_dir(FAKE_DIR, 0)
    real = load_texts_from_dir(REAL_DIR, 1)

    dataset = fake + real
    random.shuffle(dataset)

    print(f"✔ Total Fake: {len(fake)}")
    print(f"✔ Total Real: {len(real)}")
    print(f"✔ Total: {len(dataset)}")

    if not dataset:
        # zip(*[]) yields nothing, so the unpacking below would otherwise fail
        # with an unrelated "not enough values to unpack" error.
        raise ValueError(
            f"Nenhuma amostra carregada de {FAKE_DIR!r} nem de {REAL_DIR!r}."
        )

    texts, labels = zip(*dataset)
    return list(texts), list(labels)
64
+
65
# TORCH DATASET
class NewsDataset(Dataset):
    """Dataset that tokenizes one text per item, at access time."""

    def __init__(self, texts, labels, tokenizer):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        batch = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
        )
        # squeeze() strips the size-1 dimensions from each returned tensor.
        item = {name: tensor.squeeze() for name, tensor in batch.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
86
+
87
# TRAINING PROCESS
def train():
    """Fine-tune MODEL_NAME on the loaded dataset and save it to OUTPUT_DIR."""
    texts, labels = load_dataset()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2
    )
    model = model.to(device)

    loader = DataLoader(
        NewsDataset(texts, labels, tokenizer),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    optimizer = AdamW(model.parameters(), lr=LR)
    # One scheduler step per batch, across all epochs, with zero warmup steps.
    total_steps = len(loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)

    print("\n🚀 Iniciando fine-tuning do BERT...\n")

    model.train()

    for epoch in range(EPOCHS):
        print(f"\n===== Época {epoch+1}/{EPOCHS} =====")
        epoch_loss = 0

        for batch in tqdm(loader):
            # Move every tensor of the batch to the training device.
            batch = {name: tensor.to(device) for name, tensor in batch.items()}

            loss = model(**batch).loss
            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        print(f"📉 Loss da época: {epoch_loss / len(loader):.4f}")

    # Save the model and the tokenizer side by side.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print(f"\n🎉 Modelo salvo em: {OUTPUT_DIR}\n")


if __name__ == "__main__":
    train()
dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# OCR runtime: Tesseract and its Portuguese language data.
# The apt package lists are removed in the same layer so they are not kept
# in the final image.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-por \
    libgl1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install dependencies before copying the source so this layer stays cached
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pytesseract
4
+ pillow
5
+ python-multipart
6
+ transformers
7
+ torch
8
+ numpy