From 50fe9b747687bf5469e686031f9f4bb9fb8d286f Mon Sep 17 00:00:00 2001
From: Maciej Lebiest <>
Date: Wed, 8 Feb 2023 11:13:47 +0100
Subject: [PATCH] Add possibility to replace any characters in filename with
 "_" char.

---
 README.md   |   4 +
 exporter.py | 230 ++++++++++++++++++++++++++++++++++------------------
 2 files changed, 153 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index b289f21..c40f916 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ Customizable script for exporting notes from BookStack through API
 - export at multiple levels at once (export Books or/and Chapters or/and Pages as files)
 - choose if local files should be updated only if their edit timestamp is older than remote document last edit, or timestamps should be ignored and files will always be overwritten with the newest version
 - customizable path for placing exported notes
+- configure replacing any characters in filenames with "_" for any filesystem compatibility
 - authorization token is loaded from txt file
 
 Requirements:
@@ -21,6 +22,7 @@ python exporter.py \
     -H https://wiki.example.com \
     -f pdf markdown plaintext html \
     -l pages chapters books \
+    -c "/" "#" \
     --force-update-files \
     -t ./token.txt \
     -V debug \
@@ -37,6 +39,8 @@ options:
   -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], 
                        --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...]
                         Space separated list of formats to use for export.
+  -c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...], --forbidden-chars FORBIDDEN_CHARS [FORBIDDEN_CHARS ...]
+                        Space separated list of symbols to be replaced with "_" in filenames.
   -l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...]
                         Space separated list of levels at which should be export performed.
   --force-update-files  Set this option to skip checking local files timestamps against remote last edit
diff --git a/exporter.py b/exporter.py
index 69d97b7..448ef72 100644
--- a/exporter.py
+++ b/exporter.py
@@ -5,6 +5,7 @@ import os
 from datetime import datetime
 from logging import info, error, debug
 from pathlib import Path
+import sys
 from typing import Union
 from urllib.request import urlopen, Request
 import urllib.parse
@@ -17,11 +18,7 @@ FORMATS: dict['str', 'str'] = {
     'html': 'html'
 }
 
-LEVELS = [
-    'pages',
-    'chapters',
-    'books'
-]
+LEVELS = ['pages', 'chapters', 'books']
 
 LOG_LEVEL: dict = {
     'debug': logging.DEBUG,
@@ -30,35 +27,77 @@ LOG_LEVEL: dict = {
     'error': logging.ERROR
 }
 
+# Characters in filenames to be replaced with "_"
+FORBIDDEN_CHARS: list[str] = ["/", "#"]
+
 parser = argparse.ArgumentParser(description='BookStack exporter')
-parser.add_argument('-p', '--path', type=str, default='.',
+parser.add_argument('-p',
+                    '--path',
+                    type=str,
+                    default='.',
                     help='Path where exported files will be placed.')
-parser.add_argument('-t', '--token-file', type=str, default=f'.{os.path.sep}token.txt',
-                    help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET')
-parser.add_argument('-H', '--host', type=str, default='https://localhost',
-                    help='Your domain with protocol prefix, example: https://example.com')
-parser.add_argument('-f', '--formats', type=str, default=['markdown'], nargs="+",
-                    help=f'Space separated list of formats to use for export.', choices=FORMATS.keys())
-parser.add_argument('-l', '--level', type=str, default=['pages'], nargs="+",
-                    help=f'Space separated list of levels at which should be export performed. ', choices=LEVELS)
-parser.add_argument('--force-update-files', action='store_true',
-                    help="Set this option to skip checking local files timestamps against remote last edit timestamps."
-                         "This will cause overwriting local files, even if they seem to be already in newest version.")
+parser.add_argument(
+    '-t',
+    '--token-file',
+    type=str,
+    default=f'.{os.path.sep}token.txt',
+    help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET')
+parser.add_argument(
+    '-H',
+    '--host',
+    type=str,
+    default='https://localhost',
+    help='Your domain with protocol prefix, example: https://example.com')
+parser.add_argument('-f',
+                    '--formats',
+                    type=str,
+                    default=['markdown'],
+                    nargs="+",
+                    help='Space separated list of formats to use for export.',
+                    choices=FORMATS.keys())
+parser.add_argument('-c',
+                    '--forbidden-chars',
+                    type=str,
+                    default=FORBIDDEN_CHARS,
+                    nargs="+",
+                    help='Space separated list of symbols to be replaced '
+                    'with "_" in filenames.')
+parser.add_argument(
+    '-l',
+    '--level',
+    type=str,
+    default=['pages'],
+    nargs="+",
+    help="Space separated list of levels at which should be export "
+    "performed. ",
+    choices=LEVELS)
+parser.add_argument(
+    '--force-update-files',
+    action='store_true',
+    help="Set this option to skip checking local files timestamps against "
+    "remote last edit timestamps. This will cause overwriting local files,"
+    " even if they seem to be already in newest version.")
 parser.set_defaults(force_update_files=False)
-parser.add_argument('-V', '--log-level', type=str, default='info',
-                    help=f'Set verbosity level. ', choices=LOG_LEVEL.keys())
+parser.add_argument('-V',
+                    '--log-level',
+                    type=str,
+                    default='info',
+                    help='Set verbosity level.',
+                    choices=LOG_LEVEL.keys())
 
 args = parser.parse_args()
 
-logging.basicConfig(format='%(levelname)s :: %(message)s', level=LOG_LEVEL.get(args.log_level))
+logging.basicConfig(format='%(levelname)s :: %(message)s',
+                    level=LOG_LEVEL.get(args.log_level))
 
 formats: list[str] = args.formats
+FORBIDDEN_CHARS = args.forbidden_chars
 
 for frmt in formats:
-    if frmt not in FORMATS.keys():
+    if frmt not in FORMATS:
         error("Unknown format name (NOT file extension), "
               "check api docs for current version of your BookStack")
-        exit(1)
+        sys.exit(1)
 
 API_PREFIX: str = f"{args.host.removesuffix(os.path.sep)}/api"
 FS_PATH: str = args.path.removesuffix(os.path.sep)
@@ -66,21 +105,25 @@ LEVEL_CHOICE: list[str] = args.level
 for lvl in LEVEL_CHOICE:
     if lvl not in LEVELS:
         error(f"Level {lvl} is not supported, can be only one of {LEVELS}")
-        exit(1)
+        sys.exit(1)
 
-with open(args.token_file, 'r') as f:
+with open(args.token_file, 'r', encoding='utf-8') as f:
     TOKEN: str = f.readline().removesuffix('\n')
 
-HEADERS = {'Content-Type': 'application/json; charset=utf-8',
-           'Authorization': f"Token {TOKEN}"}
+HEADERS = {
+    'Content-Type': 'application/json; charset=utf-8',
+    'Authorization': f"Token {TOKEN}"
+}
 SKIP_TIMESTAMPS: bool = args.force_update_files
 
 
 class Node:
-    def __init__(self, name: str,
-                 parent: Union['Node', None],
-                 node_id: int,
+    """Clas representing any node in whole bookstack documents "tree"."""
+
+    def __init__(self, name: str, parent: Union['Node', None], node_id: int,
                  last_edit_timestamp: datetime):
+        for char in FORBIDDEN_CHARS:
+            name = name.replace(char, "_")
         self.__name: str = name
         self.__children: list['Node'] = []
 
@@ -91,10 +134,12 @@ class Node:
         self.__last_edit_timestamp: datetime = last_edit_timestamp
         self.__node_id = node_id
 
-    def get_name(self) -> str:
+    @property
+    def name(self) -> str:
         return self.__name
 
-    def get_parent(self) -> Union['Node', None]:
+    @property
+    def parent(self) -> Union['Node', None]:
         return self.__parent
 
     def changed_since(self, timestamp: datetime) -> int:
@@ -124,7 +169,7 @@ class Node:
     def get_path(self) -> str:
         if self.__parent is None:
             return "."
-        return self.__parent.get_path() + os.path.sep + self.__parent.get_name()
+        return self.__parent.get_path() + os.path.sep + self.__parent.name
 
     def get_id(self) -> int:
         return self.__node_id
@@ -163,20 +208,21 @@ def api_get_bytes(path: str, **kwargs) -> bytes:
     with urlopen(request) as response:
         if response.status == 403:
             error("403 Forbidden, check your token!")
-            exit(response.status)
+            sys.exit(response.status)
 
         return response.read()
 
 
 def api_get_dict(path: str) -> dict:
+    """Make api request at specified path and return result as dict."""
     data = api_get_bytes(path).decode()
     return json.loads(data)
 
 
 def api_get_listing(path: str) -> list:
-    """
-    function for retrieving whole lists through api, it will
-    request for another 50 until have collected "total" amount
+    """Retrieve whole lists through api.
+
+    Request for another 50 until have collected "total" amount.
     :param path:
     :return:
     """
@@ -186,9 +232,10 @@ def api_get_listing(path: str) -> list:
     result: list = []
 
     while total > len(result):
-        data: dict = json.loads(api_get_bytes(path, count=count, offset=len(result)))
-        total = data.get('total')
-        result += data.get('data')
+        data: dict = json.loads(
+            api_get_bytes(path, count=count, offset=len(result)))
+        total = data['total']
+        result += data['data']
 
         debug(f"API listing got {total} items out of maximum {count}")
 
@@ -196,6 +243,7 @@ def api_get_listing(path: str) -> list:
 
 
 def check_if_update_needed(file_path: str, document: Node) -> bool:
+    """Check if a Node need updating on disk, according to timestamps."""
     if SKIP_TIMESTAMPS:
         return True
     debug(f"Checking for update for file {file_path}")
@@ -203,80 +251,95 @@ def check_if_update_needed(file_path: str, document: Node) -> bool:
     if not os.path.exists(file_path):
         debug(f"Document {file_path} is missing on disk, update needed.")
         return True
-    local_last_edit: datetime = datetime.utcfromtimestamp(os.path.getmtime(file_path))
+    local_last_edit: datetime = datetime.utcfromtimestamp(
+        os.path.getmtime(file_path))
     remote_last_edit: datetime = document.get_last_edit_timestamp()
 
-    debug(f"Local file creation timestamp: {local_last_edit.date()} {local_last_edit.time()}, "
-          f"remote edit timestamp:  {remote_last_edit.date()} {remote_last_edit.time()}")
+    debug("Local file creation timestamp: "
+          f"{local_last_edit.date()} {local_last_edit.time()}, "
+          "remote edit timestamp:  "
+          f"{remote_last_edit.date()} {remote_last_edit.time()}")
     changes: int = document.changed_since(local_last_edit)
 
     if changes > 0:
-        info(f"Document \"{file_path}\" consists of {changes} outdated documents, update needed.")
+        info(f"Document \"{file_path}\" consists of {changes} "
+             "outdated documents, update needed.")
         return True
 
-    debug(f"Document \"{file_path}\" consists of {changes} outdated documents, skipping updating.")
+    debug(f"Document \"{file_path}\" consists of {changes} "
+          "outdated documents, skipping updating.")
     return False
 
 
 def export(documents: list[Node], level: str):
+    """Save Node to file."""
     for document in documents:
         make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}")
 
-        for frmt in formats:
-            path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}{os.path.sep}{document.get_name()}.{FORMATS[frmt]}"
+        for v_format in formats:
+            path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}" + \
+                f"{os.path.sep}{document.name}.{FORMATS[v_format]}"
 
             if not check_if_update_needed(path, document):
                 continue
 
-            data: bytes = api_get_bytes(f'{level}/{document.get_id()}/export/{frmt}')
-            with open(path, 'wb') as f:
+            data: bytes = api_get_bytes(
+                f'{level}/{document.get_id()}/export/{v_format}')
+            with open(path, 'wb') as file:
                 info(f"Saving {path}")
-                f.write(data)
+                file.write(data)
 
 
 info("Getting info about Shelves and their Books")
 
 for shelf_data in api_get_listing('shelves'):
 
-    last_edit_timestamp: datetime = api_timestamp_string_to_datetime(shelf_data['updated_at'])
-    shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'), last_edit_timestamp)
+    last_edit_ts: datetime = api_timestamp_string_to_datetime(
+        shelf_data['updated_at'])
+    shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'),
+                 last_edit_ts)
 
-    debug(f"Shelf: \"{shelf.get_name()}\", ID: {shelf.get_id()}")
+    debug(f"Shelf: \"{shelf.name}\", ID: {shelf.get_id()}")
     shelves[shelf.get_id()] = shelf
 
     shelf_details = api_get_dict(f'shelves/{shelf.get_id()}')
 
     if shelf_details.get('books') is None:
         continue
-    for book_data in shelf_details.get('books'):
+    for book_data in shelf_details['books']:
 
-        last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at'])
-        book = Node(book_data.get('name'), shelf, book_data.get('id'), last_edit_timestamp)
-        debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}")
+        last_edit_ts: datetime = api_timestamp_string_to_datetime(
+            book_data['updated_at'])
+        book = Node(book_data.get('name'), shelf, book_data.get('id'),
+                    last_edit_ts)
+        debug(f"Book: \"{book.name}\", ID: {book.get_id()}")
         books[book.get_id()] = book
 
 info("Getting info about Books not belonging to any shelf")
 
 for book_data in api_get_listing('books'):
-    if book_data.get('id') in books.keys():
+    if book_data.get('id') in books:
         continue
 
-    last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at'])
-    book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_timestamp)
+    last_edit_ts: datetime = api_timestamp_string_to_datetime(
+        book_data['updated_at'])
+    book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_ts)
 
-    debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}, last edit: {book.get_last_edit_timestamp()}")
-    info(f"Book \"{book.get_name()} has no shelf assigned.\"")
+    debug(f"Book: \"{book.name}\", ID: {book.get_id()}, "
+          f"last edit: {book.get_last_edit_timestamp()}")
+    info(f"Book \"{book.name} has no shelf assigned.\"")
     books[book.get_id()] = book
 
 info("Getting info about Chapters")
 
 for chapter_data in api_get_listing('chapters'):
-    last_edit_timestamp: datetime = api_timestamp_string_to_datetime(chapter_data['updated_at'])
+    last_edit_ts: datetime = api_timestamp_string_to_datetime(
+        chapter_data['updated_at'])
     chapter = Node(chapter_data.get('name'),
                    books.get(chapter_data.get('book_id')),
-                   chapter_data.get('id'),
-                   last_edit_timestamp)
-    debug(f"Chapter: \"{chapter.get_name()}\", ID: {chapter.get_id()}, last edit: {chapter.get_last_edit_timestamp()}")
+                   chapter_data.get('id'), last_edit_ts)
+    debug(f"Chapter: \"{chapter.name}\", ID: {chapter.get_id()},"
+          f" last edit: {chapter.get_last_edit_timestamp()}")
     chapters[chapter.get_id()] = chapter
 
 info("Getting info about Pages")
@@ -284,38 +347,43 @@ info("Getting info about Pages")
 for page_data in api_get_listing('pages'):
     parent_id = page_data.get('chapter_id')
 
-    last_edit_timestamp: datetime = api_timestamp_string_to_datetime(page_data['updated_at'])
+    last_edit_ts: datetime = api_timestamp_string_to_datetime(
+        page_data['updated_at'])
 
-    if parent_id not in chapters.keys():
-        parent = books.get(page_data.get('book_id'))
-        page = Node(page_data.get('name'), parent, page_data.get('id'), last_edit_timestamp)
+    if parent_id not in chapters:
+        parent = books[page_data['book_id']]
+        page = Node(page_data.get('name'), parent, page_data.get('id'),
+                    last_edit_ts)
 
-        info(f"Page \"{page.get_name()}\" is not in any chapter, "
-             f"using Book \"{parent.get_name()}\" as a parent.")
+        info(f"Page \"{page.name}\" is not in any chapter, "
+             f"using Book \"{parent.name}\" as a parent.")
 
-        debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}")
+        debug(f"Page: \"{page.name}\", ID: {page.get_id()},"
+              f" last edit: {page.get_last_edit_timestamp()}")
         pages[page.get_id()] = page
         pages_not_in_chapter[page.get_id()] = page
         continue
 
-    page = Node(page_data.get('name'), chapters.get(parent_id), page_data.get('id'), last_edit_timestamp)
-    debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}")
+    page = Node(page_data.get('name'), chapters.get(parent_id),
+                page_data.get('id'), last_edit_ts)
+    debug(f"Page: \"{page.name}\", ID: {page.get_id()}, "
+          f"last edit: {page.get_last_edit_timestamp()}")
     pages[page.get_id()] = page
 
 files: list[Node] = []
-export_pages_not_in_chapter: bool = False
+EXPORT_PAGES_NOT_IN_CHAPTER: bool = False
 
 for lvl in LEVEL_CHOICE:
     if lvl == 'pages':
-        files = pages.values()
+        files = list(pages.values())
     elif lvl == 'chapters':
-        files = chapters.values()
-        export_pages_not_in_chapter = True
+        files = list(chapters.values())
+        EXPORT_PAGES_NOT_IN_CHAPTER = True
     elif lvl == 'books':
-        files = books.values()
+        files = list(books.values())
 
     export(files, lvl)
 
-if export_pages_not_in_chapter:
+if EXPORT_PAGES_NOT_IN_CHAPTER:
     info("Exporting pages that are not in chapter...")
-    export(pages_not_in_chapter.values(), 'pages')
+    export(list(pages_not_in_chapter.values()), 'pages')