Add possibility to replace any characters in filename with "_" char.

This commit is contained in:
Maciej Lebiest 2023-02-08 11:13:47 +01:00
parent 5139a48760
commit 50fe9b7476
2 changed files with 153 additions and 81 deletions

View file

@ -7,6 +7,7 @@ Customizable script for exporting notes from BookStack through API
- export at multiple levels at once (export Books or/and Chapters or/and Pages as files) - export at multiple levels at once (export Books or/and Chapters or/and Pages as files)
- choose if local files should be updated only if their edit timestamp is older than remote document last edit, or timestamps should be ignored and files will always be overwritten with the newest version - choose if local files should be updated only if their edit timestamp is older than remote document last edit, or timestamps should be ignored and files will always be overwritten with the newest version
- customizable path for placing exported notes - customizable path for placing exported notes
- configure which characters in filenames are replaced with "_", for compatibility with any filesystem
- authorization token is loaded from txt file - authorization token is loaded from txt file
Requirements: Requirements:
@ -21,6 +22,7 @@ python exporter.py \
-H https://wiki.example.com \ -H https://wiki.example.com \
-f pdf markdown plaintext html \ -f pdf markdown plaintext html \
-l pages chapters books \ -l pages chapters books \
-c "/" "#" \
--force-update-files \ --force-update-files \
-t ./token.txt \ -t ./token.txt \
-V debug \ -V debug \
@ -37,6 +39,8 @@ options:
-f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...],
--formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...]
Space separated list of formats to use for export. Space separated list of formats to use for export.
-c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...], --forbidden-chars FORBIDDEN_CHARS [FORBIDDEN_CHARS ...]
Space separated list of symbols to be replaced with "_" in filenames.
-l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...] -l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...]
Space separated list of levels at which should be export performed. Space separated list of levels at which should be export performed.
--force-update-files Set this option to skip checking local files timestamps against remote last edit --force-update-files Set this option to skip checking local files timestamps against remote last edit

View file

@ -5,6 +5,7 @@ import os
from datetime import datetime from datetime import datetime
from logging import info, error, debug from logging import info, error, debug
from pathlib import Path from pathlib import Path
import sys
from typing import Union from typing import Union
from urllib.request import urlopen, Request from urllib.request import urlopen, Request
import urllib.parse import urllib.parse
@ -17,11 +18,7 @@ FORMATS: dict['str', 'str'] = {
'html': 'html' 'html': 'html'
} }
LEVELS = [ LEVELS = ['pages', 'chapters', 'books']
'pages',
'chapters',
'books'
]
LOG_LEVEL: dict = { LOG_LEVEL: dict = {
'debug': logging.DEBUG, 'debug': logging.DEBUG,
@ -30,35 +27,77 @@ LOG_LEVEL: dict = {
'error': logging.ERROR 'error': logging.ERROR
} }
# Characters in filenames to be replaced with "_"
FORBIDDEN_CHARS: list[str] = ["/", "#"]
parser = argparse.ArgumentParser(description='BookStack exporter') parser = argparse.ArgumentParser(description='BookStack exporter')
parser.add_argument('-p', '--path', type=str, default='.', parser.add_argument('-p',
'--path',
type=str,
default='.',
help='Path where exported files will be placed.') help='Path where exported files will be placed.')
parser.add_argument('-t', '--token-file', type=str, default=f'.{os.path.sep}token.txt', parser.add_argument(
help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET') '-t',
parser.add_argument('-H', '--host', type=str, default='https://localhost', '--token-file',
help='Your domain with protocol prefix, example: https://example.com') type=str,
parser.add_argument('-f', '--formats', type=str, default=['markdown'], nargs="+", default=f'.{os.path.sep}token.txt',
help=f'Space separated list of formats to use for export.', choices=FORMATS.keys()) help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET')
parser.add_argument('-l', '--level', type=str, default=['pages'], nargs="+", parser.add_argument(
help=f'Space separated list of levels at which should be export performed. ', choices=LEVELS) '-H',
parser.add_argument('--force-update-files', action='store_true', '--host',
help="Set this option to skip checking local files timestamps against remote last edit timestamps." type=str,
"This will cause overwriting local files, even if they seem to be already in newest version.") default='https://localhost',
help='Your domain with protocol prefix, example: https://example.com')
parser.add_argument('-f',
'--formats',
type=str,
default=['markdown'],
nargs="+",
help='Space separated list of formats to use for export.',
choices=FORMATS.keys())
parser.add_argument('-c',
'--forbidden-chars',
type=str,
default=FORBIDDEN_CHARS,
nargs="+",
help='Space separated list of symbols to be replaced '
'with "_" in filenames.')
parser.add_argument(
'-l',
'--level',
type=str,
default=['pages'],
nargs="+",
help="Space separated list of levels at which should be export "
"performed. ",
choices=LEVELS)
parser.add_argument(
'--force-update-files',
action='store_true',
help="Set this option to skip checking local files timestamps against "
"remote last edit timestamps. This will cause overwriting local files,"
" even if they seem to be already in newest version.")
parser.set_defaults(force_update_files=False) parser.set_defaults(force_update_files=False)
parser.add_argument('-V', '--log-level', type=str, default='info', parser.add_argument('-V',
help=f'Set verbosity level. ', choices=LOG_LEVEL.keys()) '--log-level',
type=str,
default='info',
help='Set verbosity level.',
choices=LOG_LEVEL.keys())
args = parser.parse_args() args = parser.parse_args()
logging.basicConfig(format='%(levelname)s :: %(message)s', level=LOG_LEVEL.get(args.log_level)) logging.basicConfig(format='%(levelname)s :: %(message)s',
level=LOG_LEVEL.get(args.log_level))
formats: list[str] = args.formats formats: list[str] = args.formats
FORBIDDEN_CHARS = args.forbidden_chars
for frmt in formats: for frmt in formats:
if frmt not in FORMATS.keys(): if frmt not in FORMATS:
error("Unknown format name (NOT file extension), " error("Unknown format name (NOT file extension), "
"check api docs for current version of your BookStack") "check api docs for current version of your BookStack")
exit(1) sys.exit(1)
API_PREFIX: str = f"{args.host.removesuffix(os.path.sep)}/api" API_PREFIX: str = f"{args.host.removesuffix(os.path.sep)}/api"
FS_PATH: str = args.path.removesuffix(os.path.sep) FS_PATH: str = args.path.removesuffix(os.path.sep)
@ -66,21 +105,25 @@ LEVEL_CHOICE: list[str] = args.level
for lvl in LEVEL_CHOICE: for lvl in LEVEL_CHOICE:
if lvl not in LEVELS: if lvl not in LEVELS:
error(f"Level {lvl} is not supported, can be only one of {LEVELS}") error(f"Level {lvl} is not supported, can be only one of {LEVELS}")
exit(1) sys.exit(1)
with open(args.token_file, 'r') as f: with open(args.token_file, 'r', encoding='utf-8') as f:
TOKEN: str = f.readline().removesuffix('\n') TOKEN: str = f.readline().removesuffix('\n')
HEADERS = {'Content-Type': 'application/json; charset=utf-8', HEADERS = {
'Authorization': f"Token {TOKEN}"} 'Content-Type': 'application/json; charset=utf-8',
'Authorization': f"Token {TOKEN}"
}
SKIP_TIMESTAMPS: bool = args.force_update_files SKIP_TIMESTAMPS: bool = args.force_update_files
class Node: class Node:
def __init__(self, name: str, """Class representing any node in the whole BookStack document "tree"."""
parent: Union['Node', None],
node_id: int, def __init__(self, name: str, parent: Union['Node', None], node_id: int,
last_edit_timestamp: datetime): last_edit_timestamp: datetime):
for char in FORBIDDEN_CHARS:
name = name.replace(char, "_")
self.__name: str = name self.__name: str = name
self.__children: list['Node'] = [] self.__children: list['Node'] = []
@ -91,10 +134,12 @@ class Node:
self.__last_edit_timestamp: datetime = last_edit_timestamp self.__last_edit_timestamp: datetime = last_edit_timestamp
self.__node_id = node_id self.__node_id = node_id
def get_name(self) -> str: @property
def name(self) -> str:
return self.__name return self.__name
def get_parent(self) -> Union['Node', None]: @property
def parent(self) -> Union['Node', None]:
return self.__parent return self.__parent
def changed_since(self, timestamp: datetime) -> int: def changed_since(self, timestamp: datetime) -> int:
@ -124,7 +169,7 @@ class Node:
def get_path(self) -> str: def get_path(self) -> str:
if self.__parent is None: if self.__parent is None:
return "." return "."
return self.__parent.get_path() + os.path.sep + self.__parent.get_name() return self.__parent.get_path() + os.path.sep + self.__parent.name
def get_id(self) -> int: def get_id(self) -> int:
return self.__node_id return self.__node_id
@ -163,20 +208,21 @@ def api_get_bytes(path: str, **kwargs) -> bytes:
with urlopen(request) as response: with urlopen(request) as response:
if response.status == 403: if response.status == 403:
error("403 Forbidden, check your token!") error("403 Forbidden, check your token!")
exit(response.status) sys.exit(response.status)
return response.read() return response.read()
def api_get_dict(path: str) -> dict: def api_get_dict(path: str) -> dict:
"""Make api request at specified path and return result as dict."""
data = api_get_bytes(path).decode() data = api_get_bytes(path).decode()
return json.loads(data) return json.loads(data)
def api_get_listing(path: str) -> list: def api_get_listing(path: str) -> list:
""" """Retrieve whole lists through api.
function for retrieving whole lists through api, it will
request for another 50 until have collected "total" amount Request for another 50 until have collected "total" amount.
:param path: :param path:
:return: :return:
""" """
@ -186,9 +232,10 @@ def api_get_listing(path: str) -> list:
result: list = [] result: list = []
while total > len(result): while total > len(result):
data: dict = json.loads(api_get_bytes(path, count=count, offset=len(result))) data: dict = json.loads(
total = data.get('total') api_get_bytes(path, count=count, offset=len(result)))
result += data.get('data') total = data['total']
result += data['data']
debug(f"API listing got {total} items out of maximum {count}") debug(f"API listing got {total} items out of maximum {count}")
@ -196,6 +243,7 @@ def api_get_listing(path: str) -> list:
def check_if_update_needed(file_path: str, document: Node) -> bool: def check_if_update_needed(file_path: str, document: Node) -> bool:
"""Check if a Node needs updating on disk, according to timestamps."""
if SKIP_TIMESTAMPS: if SKIP_TIMESTAMPS:
return True return True
debug(f"Checking for update for file {file_path}") debug(f"Checking for update for file {file_path}")
@ -203,80 +251,95 @@ def check_if_update_needed(file_path: str, document: Node) -> bool:
if not os.path.exists(file_path): if not os.path.exists(file_path):
debug(f"Document {file_path} is missing on disk, update needed.") debug(f"Document {file_path} is missing on disk, update needed.")
return True return True
local_last_edit: datetime = datetime.utcfromtimestamp(os.path.getmtime(file_path)) local_last_edit: datetime = datetime.utcfromtimestamp(
os.path.getmtime(file_path))
remote_last_edit: datetime = document.get_last_edit_timestamp() remote_last_edit: datetime = document.get_last_edit_timestamp()
debug(f"Local file creation timestamp: {local_last_edit.date()} {local_last_edit.time()}, " debug("Local file creation timestamp: "
f"remote edit timestamp: {remote_last_edit.date()} {remote_last_edit.time()}") f"{local_last_edit.date()} {local_last_edit.time()}, "
"remote edit timestamp: "
f"{remote_last_edit.date()} {remote_last_edit.time()}")
changes: int = document.changed_since(local_last_edit) changes: int = document.changed_since(local_last_edit)
if changes > 0: if changes > 0:
info(f"Document \"{file_path}\" consists of {changes} outdated documents, update needed.") info(f"Document \"{file_path}\" consists of {changes} "
"outdated documents, update needed.")
return True return True
debug(f"Document \"{file_path}\" consists of {changes} outdated documents, skipping updating.") debug(f"Document \"{file_path}\" consists of {changes} "
"outdated documents, skipping updating.")
return False return False
def export(documents: list[Node], level: str): def export(documents: list[Node], level: str):
"""Save each Node to a file per selected format, skipping up-to-date files."""
for document in documents: for document in documents:
make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}") make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}")
for frmt in formats: for v_format in formats:
path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}{os.path.sep}{document.get_name()}.{FORMATS[frmt]}" path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}" + \
f"{os.path.sep}{document.name}.{FORMATS[v_format]}"
if not check_if_update_needed(path, document): if not check_if_update_needed(path, document):
continue continue
data: bytes = api_get_bytes(f'{level}/{document.get_id()}/export/{frmt}') data: bytes = api_get_bytes(
with open(path, 'wb') as f: f'{level}/{document.get_id()}/export/{v_format}')
with open(path, 'wb') as file:
info(f"Saving {path}") info(f"Saving {path}")
f.write(data) file.write(data)
info("Getting info about Shelves and their Books") info("Getting info about Shelves and their Books")
for shelf_data in api_get_listing('shelves'): for shelf_data in api_get_listing('shelves'):
last_edit_timestamp: datetime = api_timestamp_string_to_datetime(shelf_data['updated_at']) last_edit_ts: datetime = api_timestamp_string_to_datetime(
shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'), last_edit_timestamp) shelf_data['updated_at'])
shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'),
last_edit_ts)
debug(f"Shelf: \"{shelf.get_name()}\", ID: {shelf.get_id()}") debug(f"Shelf: \"{shelf.name}\", ID: {shelf.get_id()}")
shelves[shelf.get_id()] = shelf shelves[shelf.get_id()] = shelf
shelf_details = api_get_dict(f'shelves/{shelf.get_id()}') shelf_details = api_get_dict(f'shelves/{shelf.get_id()}')
if shelf_details.get('books') is None: if shelf_details.get('books') is None:
continue continue
for book_data in shelf_details.get('books'): for book_data in shelf_details['books']:
last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at']) last_edit_ts: datetime = api_timestamp_string_to_datetime(
book = Node(book_data.get('name'), shelf, book_data.get('id'), last_edit_timestamp) book_data['updated_at'])
debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}") book = Node(book_data.get('name'), shelf, book_data.get('id'),
last_edit_ts)
debug(f"Book: \"{book.name}\", ID: {book.get_id()}")
books[book.get_id()] = book books[book.get_id()] = book
info("Getting info about Books not belonging to any shelf") info("Getting info about Books not belonging to any shelf")
for book_data in api_get_listing('books'): for book_data in api_get_listing('books'):
if book_data.get('id') in books.keys(): if book_data.get('id') in books:
continue continue
last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at']) last_edit_ts: datetime = api_timestamp_string_to_datetime(
book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_timestamp) book_data['updated_at'])
book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_ts)
debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}, last edit: {book.get_last_edit_timestamp()}") debug(f"Book: \"{book.name}\", ID: {book.get_id()}, "
info(f"Book \"{book.get_name()} has no shelf assigned.\"") f"last edit: {book.get_last_edit_timestamp()}")
info(f"Book \"{book.name} has no shelf assigned.\"")
books[book.get_id()] = book books[book.get_id()] = book
info("Getting info about Chapters") info("Getting info about Chapters")
for chapter_data in api_get_listing('chapters'): for chapter_data in api_get_listing('chapters'):
last_edit_timestamp: datetime = api_timestamp_string_to_datetime(chapter_data['updated_at']) last_edit_ts: datetime = api_timestamp_string_to_datetime(
chapter_data['updated_at'])
chapter = Node(chapter_data.get('name'), chapter = Node(chapter_data.get('name'),
books.get(chapter_data.get('book_id')), books.get(chapter_data.get('book_id')),
chapter_data.get('id'), chapter_data.get('id'), last_edit_ts)
last_edit_timestamp) debug(f"Chapter: \"{chapter.name}\", ID: {chapter.get_id()},"
debug(f"Chapter: \"{chapter.get_name()}\", ID: {chapter.get_id()}, last edit: {chapter.get_last_edit_timestamp()}") f" last edit: {chapter.get_last_edit_timestamp()}")
chapters[chapter.get_id()] = chapter chapters[chapter.get_id()] = chapter
info("Getting info about Pages") info("Getting info about Pages")
@ -284,38 +347,43 @@ info("Getting info about Pages")
for page_data in api_get_listing('pages'): for page_data in api_get_listing('pages'):
parent_id = page_data.get('chapter_id') parent_id = page_data.get('chapter_id')
last_edit_timestamp: datetime = api_timestamp_string_to_datetime(page_data['updated_at']) last_edit_ts: datetime = api_timestamp_string_to_datetime(
page_data['updated_at'])
if parent_id not in chapters.keys(): if parent_id not in chapters:
parent = books.get(page_data.get('book_id')) parent = books[page_data['book_id']]
page = Node(page_data.get('name'), parent, page_data.get('id'), last_edit_timestamp) page = Node(page_data.get('name'), parent, page_data.get('id'),
last_edit_ts)
info(f"Page \"{page.get_name()}\" is not in any chapter, " info(f"Page \"{page.name}\" is not in any chapter, "
f"using Book \"{parent.get_name()}\" as a parent.") f"using Book \"{parent.name}\" as a parent.")
debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}") debug(f"Page: \"{page.name}\", ID: {page.get_id()},"
f" last edit: {page.get_last_edit_timestamp()}")
pages[page.get_id()] = page pages[page.get_id()] = page
pages_not_in_chapter[page.get_id()] = page pages_not_in_chapter[page.get_id()] = page
continue continue
page = Node(page_data.get('name'), chapters.get(parent_id), page_data.get('id'), last_edit_timestamp) page = Node(page_data.get('name'), chapters.get(parent_id),
debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}") page_data.get('id'), last_edit_ts)
debug(f"Page: \"{page.name}\", ID: {page.get_id()}, "
f"last edit: {page.get_last_edit_timestamp()}")
pages[page.get_id()] = page pages[page.get_id()] = page
files: list[Node] = [] files: list[Node] = []
export_pages_not_in_chapter: bool = False EXPORT_PAGES_NOT_IN_CHAPTER: bool = False
for lvl in LEVEL_CHOICE: for lvl in LEVEL_CHOICE:
if lvl == 'pages': if lvl == 'pages':
files = pages.values() files = list(pages.values())
elif lvl == 'chapters': elif lvl == 'chapters':
files = chapters.values() files = list(chapters.values())
export_pages_not_in_chapter = True EXPORT_PAGES_NOT_IN_CHAPTER = True
elif lvl == 'books': elif lvl == 'books':
files = books.values() files = list(books.values())
export(files, lvl) export(files, lvl)
if export_pages_not_in_chapter: if EXPORT_PAGES_NOT_IN_CHAPTER:
info("Exporting pages that are not in chapter...") info("Exporting pages that are not in chapter...")
export(pages_not_in_chapter.values(), 'pages') export(list(pages_not_in_chapter.values()), 'pages')