Felipe Silva committed on
Commit
d0c774c
·
1 Parent(s): 7a6c415

teste streamlit

Browse files
Files changed (3) hide show
  1. app.py +63 -8
  2. rag_utils.py +95 -0
  3. utils.py +69 -0
app.py CHANGED
@@ -1,14 +1,69 @@
1
- import gradio as gr
2
- import spaces
3
  import torch
4
 
5
  zero = torch.Tensor([0]).cuda()
6
  print(zero.device) # <-- 'cpu' 🤔
7
 
8
- @spaces.GPU
9
- def greet(n):
10
- print(zero.device) # <-- 'cuda:0' 🤗
11
- return f"Hello {zero + n} Tensor"
12
 
13
- demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
14
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
 
3
# Probe tensor: placed on the GPU when CUDA is available and on the CPU
# otherwise, so importing this script does not fail on CPU-only hosts.
zero = torch.tensor([0.0], device="cuda" if torch.cuda.is_available() else "cpu")
print(zero.device)
5
 
6
+ import streamlit as st
7
+ # from streamlit_pdf_viewer import pdf_viewer
8
+ from utils import read_file_pdf, fix_type, extract_content_in_pdf, EXTENSIONS_FILES, EXTENSIONS_IMG_FILES
9
+ from rag_utils import create_split_doc, store_docs, create_rag_chain
10
 
11
st.write("## Pergunte qualquer coisa para seu arquivo.")
st.write(
    ":dog: Faça o upload do seu arquivo e pergunte qualquer coisa a ele! Este código é open source e disponível [aqui](https://github.com/FelipeErmeson) no GitHub. :grin:"
)
st.sidebar.write("## Upload :gear:")

# Upload size limit enforced below.
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB

# UI layout: extracted text goes in the left column, the chat in the right one.
col1, col2 = st.columns(2)
my_upload = st.sidebar.file_uploader("Upload da imagem", type=["png", "jpg", "jpeg", "pdf"])

# Information about limitations
with st.sidebar.expander("ℹ️ Diretrizes da Imagem"):
    st.write("""
    - Tamanho máximo do arquivo: 10MB
    - Imagens enormes são automaticamente redimensionadas
    - Formatos suportados: PNG, JPG, JPEG, PDF
    - Processamento de tempo depende da GPU alocada
    """)

# Process the uploaded file.
if my_upload is not None:
    if my_upload.size > MAX_FILE_SIZE:
        st.error(f"O arquivo excede o limite. Por favor, realize o upload de um arquivo que contenha no máximo {MAX_FILE_SIZE/1024/1024:.1f}MB.")
    else:
        # fix_type may yield nothing for inputs it does not recognise; guard
        # the unpacking so that case falls through instead of raising.
        parsed = fix_type(my_upload)
        file, type_file = parsed if parsed is not None else (None, None)

        texto_extraido = None
        if type_file in EXTENSIONS_FILES:
            texto_extraido = extract_content_in_pdf(file)
        elif type_file in EXTENSIONS_IMG_FILES:
            # Images are accepted by the uploader, but no text extraction is
            # implemented for them yet.
            pass

        if texto_extraido is not None and not texto_extraido.strip():
            # A PDF without extractable text would produce zero chunks, which
            # the vector store cannot index; tell the user instead of crashing.
            st.warning("Não foi possível extrair texto deste arquivo.")
        elif texto_extraido is not None:
            col1.write("#### Texto extraído:")
            col1.write(texto_extraido)

            # Streamlit re-runs the whole script on every interaction, so keep
            # the vector store in the session and rebuild it only when the
            # extracted text changes, not on every question.
            if st.session_state.get("indexed_text") != texto_extraido:
                docs_splitted = create_split_doc(texto_extraido)
                st.session_state["vector_store"] = store_docs(docs_splitted)
                st.session_state["indexed_text"] = texto_extraido
            vector_store = st.session_state["vector_store"]

            if question := col2.chat_input("Faça uma pergunta ao seu documento!"):
                col2.write("📌 " + question)

                rag_chain = create_rag_chain(vector_store)
                resposta = rag_chain.run(question)
                col2.write("🎩 " + resposta)
69
+
rag_utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain_community.vectorstores import FAISS
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain.prompts import PromptTemplate
5
+
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
+ from langchain.llms import HuggingFacePipeline
8
+
9
+ from langchain.chat_models import ChatOpenAI
10
+ from langchain.chains import RetrievalQA
11
+
12
+ import torch
13
# Report accelerator status. The device name is only queried when CUDA is
# present, because torch.cuda.get_device_name() raises on CPU-only hosts.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
16
+
17
+ import os
18
# NOTE(review): this flag is set after transformers/langchain were imported
# above; if huggingface_hub reads it at import time the assignment has no
# effect -- confirm, and move it before those imports if needed.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

# Directory passed as cache_dir to from_pretrained() for model/tokenizer files.
# Other locations tried: ./model/qwen-awq, /home/felipe/.cache/huggingface/transformers
cache_dir = "/home/user/.cache/huggingface"

# Sentence embedding model used to index document chunks.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Other checkpoints tried: Qwen/Qwen2.5-7B-Instruct-AWQ, Qwen/Qwen2.5-7B-Instruct
model_name = "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
    cache_dir=cache_dir,
)
# No model.to(device) here: device_map="auto" already places the weights, and
# moving a dispatched model afterwards is redundant and can fail when parts of
# it are offloaded.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=cache_dir)
33
+
34
+ pipe = pipeline(
35
+ "text-generation",
36
+ model=model,
37
+ tokenizer=tokenizer,
38
+ max_new_tokens=512,
39
+ temperature=0.1,
40
+ do_sample=False
41
+ )
42
+
43
+ # Adapta para LangChain
44
+ llm = HuggingFacePipeline(pipeline=pipe)
45
+
46
def create_split_doc(raw_text, chunk_size=500, chunk_overlap=50):
    """Split raw text into overlapping chunks wrapped as LangChain documents.

    Args:
        raw_text: The text to split.
        chunk_size: Chunk size handed to RecursiveCharacterTextSplitter
            (defaults to the previously hard-coded 500).
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter (defaults to the previously hard-coded 50).

    Returns:
        The documents produced by the splitter's ``create_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.create_documents([raw_text])
51
+
52
def store_docs(docs):
    """Index documents in a FAISS vector store using the module embedding model.

    Args:
        docs: Documents to embed and index (e.g. the output of
            ``create_split_doc``).

    Returns:
        The FAISS vector store built from ``docs``.

    Raises:
        ValueError: If ``docs`` is empty. An index cannot be built without at
            least one embedded document, so fail early with a clear message
            instead of an opaque error from inside the vector store.
    """
    if not docs:
        raise ValueError("Cannot build a vector store from an empty list of documents.")
    return FAISS.from_documents(docs, embedding_model)
55
+
56
def create_template():
    """Build the prompt used to answer a question from retrieved context.

    Returns:
        A PromptTemplate with the ``context`` and ``question`` input variables.
    """
    # The prompt text is assembled line by line; it starts with a blank line
    # and ends with a newline after the question placeholder.
    instructions = (
        "\n"
        "Você é um especialista em extrair informações em documentos.\n"
        "Com base nas informações a seguir, forneça a melhor resposta.\n"
        "Caso não tenha certeza da resposta, prefira falar que não sabe responder tal pergunta.\n"
        "Responda de maneira amigável e clara.\n"
        "\n"
        "Contexto:\n"
        "{context}\n"
        "\n"
        "Pergunta:\n"
        "{question}\n"
    )
    return PromptTemplate(
        input_variables=["context", "question"],
        template=instructions,
    )
73
+
74
def create_rag_chain(vectorstore):
    """Assemble a RetrievalQA chain over the given vector store.

    Args:
        vectorstore: Vector store whose ``as_retriever()`` supplies the
            context documents.

    Returns:
        A "stuff"-type RetrievalQA chain that uses the module-level ``llm``
        and the prompt from ``create_template``.
    """
    retriever = vectorstore.as_retriever()
    prompt_kwargs = {"prompt": create_template()}
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs=prompt_kwargs,
    )
82
+
83
+
84
+ if __name__ == '__main__':
85
+ pass
86
+
87
+ # resposta = rag_chain.run(pergunta)
88
+
89
+
90
+
91
+ # pergunta = "Qual o número da nfse?"
92
+ # resposta = rag_chain.run(pergunta)
93
+
94
+ # print("📌 Pergunta:", pergunta)
95
+ # print("🎩 Resposta do Analista Fiscal:\\n", resposta)
utils.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from PIL import Image
3
+ from io import BytesIO
4
+
5
# File subtypes (the part after "/" in the upload's MIME type) handled as images.
EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
# File subtypes handled as text documents.
EXTENSIONS_FILES = ['pdf']
EXTENSIONS_ALLOWED = EXTENSIONS_IMG_FILES + EXTENSIONS_FILES

# Max dimensions for processing
MAX_IMAGE_SIZE = 2000  # pixels


def fix_type(file_upload):
    """Load an uploaded file according to its type.

    Args:
        file_upload: An upload object exposing a ``type`` attribute such as
            ``"application/pdf"``, or a ``str`` (not implemented yet).

    Returns:
        A ``(content, type_file)`` tuple. ``content`` is whatever the matching
        reader returns for images and PDFs, and ``None`` otherwise.
        ``type_file`` is the subtype taken from ``file_upload.type``, or
        ``None`` for ``str`` input. A tuple is always returned so callers can
        unpack the result unconditionally.
    """
    if isinstance(file_upload, str):
        # Path-style input is only a placeholder for now.
        print('teste: str')
        return None, None

    type_file = file_upload.type.split('/')[-1].lower()
    if type_file in EXTENSIONS_IMG_FILES:
        return read_file_img(file_upload), type_file
    if type_file in EXTENSIONS_FILES:
        return read_file_pdf(file_upload), type_file

    # Unsupported type: report it without content instead of returning None.
    return None, type_file
21
+
22
+ # Resize image while maintaining aspect ratio
23
def resize_image(image, max_size):
    """Shrink an image so neither side exceeds ``max_size``, keeping its ratio.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize(size, resample)`` method (a PIL image).
        max_size: Largest allowed width/height.

    Returns:
        ``image`` itself when it already fits, otherwise the resized image.
    """
    width, height = image.size
    if width <= max_size and height <= max_size:
        return image

    # The longer side is pinned to max_size; the shorter one is scaled by the
    # same factor and clamped to at least 1 so that very elongated images do
    # not end up with a zero-pixel dimension.
    if width > height:
        new_width = max_size
        new_height = max(1, int(height * (max_size / width)))
    else:
        new_height = max_size
        new_width = max(1, int(width * (max_size / height)))

    return image.resize((new_width, new_height), Image.LANCZOS)
36
+
37
def process_image(image_bytes):
    """Open raw image bytes as a PIL image.

    Resizing via ``resize_image(image, MAX_IMAGE_SIZE)`` is currently disabled,
    so the image is returned as opened.

    Args:
        image_bytes: Encoded image content.

    Returns:
        The opened image, or ``None`` if it could not be opened.
    """
    import logging

    try:
        return Image.open(BytesIO(image_bytes))
    except Exception:
        # Best-effort by design: callers get None for unreadable uploads, but
        # the failure is logged instead of being swallowed silently.
        logging.getLogger(__name__).exception("Could not open uploaded image")
        return None
46
+
47
def read_file_img(file_img):
    """Read an uploaded image and return the result of ``process_image``.

    Args:
        file_img: Upload object whose ``getvalue()`` returns the file bytes.
    """
    return process_image(file_img.getvalue())
51
+
52
def read_file_pdf(file_pdf):
    """Wrap an uploaded PDF in a PyPDF2 ``PdfReader`` and return it.

    Args:
        file_pdf: The PDF source handed directly to ``PdfReader``.
    """
    return PdfReader(file_pdf)
56
+
57
def extract_content_in_pdf(reader):
    """Concatenate the text of every page of a PDF reader.

    Args:
        reader: Object exposing ``pages``, each page providing
            ``extract_text()``.

    Returns:
        The text of all pages that yielded any, each followed by a newline.
        Pages whose extraction returns nothing are skipped; the result is an
        empty string when no page has text.
    """
    page_texts = (page.extract_text() for page in reader.pages)
    # A real newline separates the pages (not a literal backslash + "n").
    return "".join(f"{text}\n" for text in page_texts if text)
65
+
66
+ # st.write(f"O PDF tem {num_pages} páginas.")
67
+
68
+ if __name__ == '__main__':
69
+ pass