diff --git a/kbase_py/fileBase.py b/kbase_py/fileBase.py index d3f3c55..ad93153 100644 --- a/kbase_py/fileBase.py +++ b/kbase_py/fileBase.py @@ -43,13 +43,13 @@ def extract_text_from_pdf(file_path): print(f"读取 PDF 文件时出错: {e}") return None -def extract_text_from_epub(epub_path, txt_path): +def extract_text_from_epub(epub_path): # 加载EPUB文件 book = epub.read_epub(epub_path) - + # 创建一个空字符串来存储所有文本 full_text = "" - + # 遍历书中的所有项目 for item in book.get_items(): if item.get_type() == ebooklib.ITEM_DOCUMENT: @@ -59,10 +59,11 @@ def extract_text_from_epub(epub_path, txt_path): text = soup.get_text() # 添加到总文本中 full_text += text + "\n" - + # 将文本写入TXT文件 - with open(txt_path, 'w', encoding='utf-8') as file: - file.write(full_text) + # with open(txt_path, 'w', encoding='utf-8') as file: + # file.write(full_text) + return full_text def extract_text_from_doc(file_path): """从 DOC 文件中提取文本""" @@ -130,6 +131,10 @@ def dowload_file(file_url, save_path): #filepath为fileurl,从网络下载 def get_file_content(fileName): save_path = '/home/saw/file/' + fileName + #查看文件是否存在 + if os.path.exists(save_path): + print(f"文件已存在: {save_path}") + return extract_text(save_path) file_url = 'https://pm.ljsea.top/tool/file/' + fileName dowload_file(file_url, save_path) """获取文件内容""" diff --git a/kbase_py/fileCon.py b/kbase_py/fileCon.py index 03b4dc8..fca1c5b 100644 --- a/kbase_py/fileCon.py +++ b/kbase_py/fileCon.py @@ -89,9 +89,18 @@ def get_file_add_convert(): file_list = get_file() if file_list: for file in file_list: + #文件类型是否允许 + if file["FileStoreName"].endswith(".txt") or file["FileStoreName"].endswith(".docx") or file["FileStoreName"].endswith(".pdf") or file["FileStoreName"].endswith(".epub"): + print("文件类型允许") + else: + print("文件类型不允许:", file["FileStoreName"]) + continue print("文件ID:", file["ID"]) print("文件名称:", file["FileName"], "\t", file["FileStoreName"]) text = fileBase.get_file_content(file["FileStoreName"]) + if text is None: + print("文件内容获取失败") + return None print("文件内容长度:", len(text)) #将文件内容上传 print(create_file_content(file["ID"], text)) @@ -120,5 +129,5 @@ def get_Kbase_server_id(): print(f"请求失败,状态码: {response.status_code}, 错误信息: {response.text}") return None -# if __name__ == "__main__": -# get_file_add_convert() \ No newline at end of file +if __name__ == "__main__": + get_file_add_convert() \ No newline at end of file