Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ from sentence_transformers import SentenceTransformer
|
|
| 7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
from langchain.prompts import PromptTemplate
|
| 9 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
| 10 |
-
from read_photodocument import convert_PDF_to_Text
|
| 11 |
from doctr.io import DocumentFile
|
| 12 |
from doctr.models import ocr_predictor
|
| 13 |
import contextlib
|
|
@@ -61,6 +61,7 @@ def summarize_data(docs,llm_model,chain_type='refine'):
|
|
| 61 |
|
| 62 |
prompt = PromptTemplate.from_template(prompt_template)
|
| 63 |
refine_prompt = PromptTemplate.from_template(refine_template)
|
|
|
|
| 64 |
|
| 65 |
chain = load_summarize_chain(llm=llm_model,
|
| 66 |
chain_type=chain_type,
|
|
@@ -76,6 +77,7 @@ def summarize_data(docs,llm_model,chain_type='refine'):
|
|
| 76 |
consice_sumary = re.search("CONCISE SUMMARY:.*\.*$", output_text).group(0)
|
| 77 |
dash_id = consice_sumary.find('-')
|
| 78 |
return consice_sumary[:dash_id].replace(' ','\n')
|
|
|
|
| 79 |
# matches = re.finditer(regex, output_text, re.DOTALL)
|
| 80 |
# for matchNum, match in enumerate(matches, start=1):
|
| 81 |
# for groupNum in range(0, len(match.groups())):
|
|
@@ -115,6 +117,15 @@ def document_loader(temperature,max_tokens,api_key,model_name,file_path):
|
|
| 115 |
was_truncated = conversion_stats["truncated"]
|
| 116 |
print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
if converted_txt:
|
| 119 |
print("Document Processed ..")
|
| 120 |
texts = process_documents(texts=converted_txt)
|
|
|
|
| 7 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
from langchain.prompts import PromptTemplate
|
| 9 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
| 10 |
+
from read_photodocument import convert_PDF_to_Text,convert_image_to_pdf
|
| 11 |
from doctr.io import DocumentFile
|
| 12 |
from doctr.models import ocr_predictor
|
| 13 |
import contextlib
|
|
|
|
| 61 |
|
| 62 |
prompt = PromptTemplate.from_template(prompt_template)
|
| 63 |
refine_prompt = PromptTemplate.from_template(refine_template)
|
| 64 |
+
|
| 65 |
|
| 66 |
chain = load_summarize_chain(llm=llm_model,
|
| 67 |
chain_type=chain_type,
|
|
|
|
| 77 |
consice_sumary = re.search("CONCISE SUMMARY:.*\.*$", output_text).group(0)
|
| 78 |
dash_id = consice_sumary.find('-')
|
| 79 |
return consice_sumary[:dash_id].replace(' ','\n')
|
| 80 |
+
|
| 81 |
# matches = re.finditer(regex, output_text, re.DOTALL)
|
| 82 |
# for matchNum, match in enumerate(matches, start=1):
|
| 83 |
# for groupNum in range(0, len(match.groups())):
|
|
|
|
| 117 |
was_truncated = conversion_stats["truncated"]
|
| 118 |
print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
|
| 119 |
|
| 120 |
+
elif file_path.endswith('.jpg') or file_path.endswith('.jpeg'):
|
| 121 |
+
conversion_stats = convert_image_to_pdf(file_path,model)
|
| 122 |
+
converted_txt = conversion_stats["converted_text"]
|
| 123 |
+
num_pages = conversion_stats["num_pages"]
|
| 124 |
+
was_truncated = conversion_stats["truncated"]
|
| 125 |
+
print("Converted text {}\nNum Pages;{}".format(converted_txt,num_pages))
|
| 126 |
+
|
| 127 |
+
else:
|
| 128 |
+
return ("Invalid Format ....")
|
| 129 |
if converted_txt:
|
| 130 |
print("Document Processed ..")
|
| 131 |
texts = process_documents(texts=converted_txt)
|