import os
import subprocess
import tempfile

import docx
import ebooklib
import PyPDF2
import requests
from bs4 import BeautifulSoup
from ebooklib import epub
from epub_conversion.utils import open_book, convert_epub_to_lines

def extract_text_from_txt(file_path):
    """Read a UTF-8 text file and return its content.

    The file is decoded with ``utf-8-sig`` so that an optional leading
    byte-order mark is dropped instead of ending up as a stray U+FEFF
    character at the start of the returned text. Files without a BOM decode
    exactly as with plain UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file content, or None if the file could not be read or
        decoded (an error message is printed in that case).
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except Exception as e:
        print(f"读取 TXT 文件时出错: {e}")
        return None

def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a DOCX file, one per line.

    Only ``Document.paragraphs`` is read. Any error while opening or reading
    the document is printed and reported as None.
    """
    try:
        document = docx.Document(file_path)
        return '\n'.join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        print(f"读取 DOCX 文件时出错: {e}")
        return None

def extract_text_from_pdf(file_path):
    """Return the text of all pages of a PDF file, concatenated.

    Page texts are joined without any separator. Any error while opening or
    parsing the file is printed and reported as None.
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Extract while the file is still open; the reader pulls page
            # data from the underlying stream.
            page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_texts)
    except Exception as e:
        print(f"读取 PDF 文件时出错: {e}")
        return None

def extract_text_from_epub(epub_path, txt_path=None):
    """Extract the plain text of an EPUB file.

    Every item of type ``ebooklib.ITEM_DOCUMENT`` is parsed as HTML and its
    text is appended to the result, followed by a newline.

    Args:
        epub_path: Path of the EPUB file to read.
        txt_path: Optional path of a UTF-8 text file to write the extracted
            text to. When omitted, nothing is written to disk.

    Returns:
        The extracted text, or None if reading, parsing or writing failed
        (an error message is printed in that case, matching the other
        extractors in this module).
    """
    try:
        book = epub.read_epub(epub_path)

        chunks = []
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                # Strip the markup and keep only the visible text.
                soup = BeautifulSoup(item.content, 'html.parser')
                chunks.append(soup.get_text() + "\n")
        full_text = "".join(chunks)

        if txt_path is not None:
            with open(txt_path, 'w', encoding='utf-8') as file:
                file.write(full_text)
        return full_text
    except Exception as e:
        print(f"读取 EPUB 文件时出错: {e}")
        return None

def extract_text_from_doc(file_path):
    """Extract text from a legacy DOC file by converting it to DOCX first.

    The conversion is done with LibreOffice (``soffice --headless``) into a
    temporary directory, so a DOCX file that already exists next to the
    source is never overwritten or deleted, and the temporary output is
    removed even when extraction fails.

    Args:
        file_path: Path of the DOC file to read.

    Returns:
        The extracted text, or None if the file is missing, the conversion
        failed, or the converted file could not be read (an error message is
        printed in that case).
    """
    try:
        if not os.path.isfile(file_path):
            print(f"读取 DOC 文件时出错: 文件不存在: {file_path}")
            return None

        with tempfile.TemporaryDirectory() as work_dir:
            # check=True turns a non-zero soffice exit status into an error
            # instead of silently continuing with a missing output file.
            subprocess.run(
                ['soffice', '--headless', '--convert-to', 'docx',
                 file_path, '--outdir', work_dir],
                check=True,
            )
            stem = os.path.splitext(os.path.basename(file_path))[0]
            docx_path = os.path.join(work_dir, stem + '.docx')
            if not os.path.exists(docx_path):
                print("读取 DOC 文件时出错: 转换失败，未生成 DOCX 文件")
                return None
            return extract_text_from_docx(docx_path)
    except Exception as e:
        print(f"读取 DOC 文件时出错: {e}")
        return None

def extract_text(file_path):
    """Dispatch to the extractor that matches the file's extension.

    The extension is compared in lower case. Returns whatever the selected
    extractor returns; for an unsupported extension a message is printed and
    None is returned.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Reject unknown formats up front so the dispatch below never misses.
    if suffix not in ('.txt', '.docx', '.pdf', '.epub', '.doc'):
        print(f"不支持的文件格式: {suffix}")
        return None

    extractors = {
        '.txt': extract_text_from_txt,
        '.docx': extract_text_from_docx,
        '.pdf': extract_text_from_pdf,
        '.epub': extract_text_from_epub,
        '.doc': extract_text_from_doc,
    }
    return extractors[suffix](file_path)

# # Example usage
# file_path = '/home/saw/file/人生 (路遥).epub'  # replace with the actual file path
# text = extract_text(file_path)
# if text:
#     print(text[:1000])  # print the first 1000 characters

def get_file_content_by_path(file_path):
    """Return the text extracted from a local file.

    An empty or missing extraction result is treated as a failure: a message
    is printed and None is returned.
    """
    content = extract_text(file_path)
    if not content:
        print("未能获取文件内容")
        return None
    return content


def dowload_file(file_url, save_path, timeout=30):
    """Download a file over HTTP and store it at ``save_path``.

    Args:
        file_url: URL to fetch with a GET request.
        save_path: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block forever on an unresponsive host.

    Returns:
        True if the server answered with status 200 and the body was written,
        False otherwise (a message describing the failure is printed).
    """
    try:
        response = requests.get(file_url, timeout=timeout)
        if response.status_code != 200:
            print(f"下载失败，状态码: {response.status_code}")
            return False
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"文件下载成功: {save_path}")
        return True
    except Exception as e:
        print(f"下载文件时出错: {e}")
        return False

# The file name is appended to a fixed base URL and the file is downloaded from the network.
def get_file_content(fileName):
    """Download a file by name and return its extracted text.

    The file is fetched from ``https://pm.ljsea.top/tool/file/<fileName>``
    and saved as ``/home/saw/file/<fileName>`` before extraction.

    Args:
        fileName: Name of the file, relative to both the remote base URL and
            the local download directory. Names that would resolve outside
            the download directory (for example via ``..``) are rejected.

    Returns:
        The extracted text, or None if the name is rejected or no text could
        be obtained (a message is printed in that case).
    """
    base_dir = '/home/saw/file/'
    save_path = base_dir + fileName

    # Refuse names that escape the download directory once normalised, so a
    # crafted name cannot overwrite arbitrary files.
    if not os.path.normpath(save_path).startswith(base_dir):
        print(f"非法的文件名: {fileName}")
        return None

    file_url = 'https://pm.ljsea.top/tool/file/' + fileName
    dowload_file(file_url, save_path)
    text = extract_text(save_path)
    if text:
        return text
    else:
        print("未能获取文件内容")
        return None