From fb65dfad9943b857f2df367648926e6ea60febf3 Mon Sep 17 00:00:00 2001 From: Maciej Lebiest Date: Thu, 23 Jun 2022 14:34:28 +0200 Subject: [PATCH 01/10] add debug for printing elements names and id numbers --- exporter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/exporter.py b/exporter.py index e5cd7ee..cc2c723 100644 --- a/exporter.py +++ b/exporter.py @@ -127,7 +127,8 @@ def api_get_bytes(path: str) -> bytes: def api_get_dict(path: str) -> dict: - return json.loads(api_get_bytes(path).decode()) + data = api_get_bytes(path).decode() + return json.loads(data) def check_if_update_needed(file: str, remote_last_edit: datetime) -> bool: @@ -163,6 +164,7 @@ info("Getting info about Shelves and their Books") for shelf_data in api_get_dict('shelves').get('data'): shelf = Node(shelf_data.get('name'), None, shelf_data.get('id')) + debug(f"Shelf: \"{shelf.get_name()}\", ID: {shelf.get_id()}") shelves[shelf.get_id()] = shelf shelf_details = api_get_dict(f'shelves/{shelf.get_id()}') @@ -171,6 +173,7 @@ for shelf_data in api_get_dict('shelves').get('data'): continue for book_data in shelf_details.get('books'): book = Node(book_data.get('name'), shelf, book_data.get('id')) + debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}") books[book.get_id()] = book info("Getting info about Books not belonging to any shelf") @@ -179,6 +182,7 @@ for book_data in api_get_dict('books').get('data'): if book_data.get('id') in books.keys(): continue book = Node(book_data.get('name'), None, book_data.get('id')) + debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}") info(f"Book \"{book.get_name()} has no shelf assigned.\"") books[book.get_id()] = book @@ -186,6 +190,7 @@ info("Getting info about Chapters") for chapter_data in api_get_dict('chapters').get('data'): chapter = Node(chapter_data.get('name'), books.get(chapter_data.get('book_id')), chapter_data.get('id')) + debug(f"Chapter: \"{chapter.get_name()}\", ID: {chapter.get_id()}") 
chapters[chapter.get_id()] = chapter info("Getting info about Pages") @@ -197,11 +202,13 @@ for page_data in api_get_dict('pages').get('data'): info(f"Page \"{page_data.get('name')}\" is not in any chapter, " f"using Book \"{books.get(parent_id).get_name()}\" as a parent.") page = Node(page_data.get('name'), books.get(parent_id), page_data.get('id')) + debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}") pages[page.get_id()] = page pages_not_in_chapter[page.get_id()] = page continue page = Node(page_data.get('name'), chapters.get(parent_id), page_data.get('id')) + debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}") pages[page.get_id()] = page files: list[Node] = [] From e2bc12eb6076464c2dc0fb594ebbda0966564fbc Mon Sep 17 00:00:00 2001 From: Maciej Lebiest Date: Thu, 23 Jun 2022 17:14:45 +0200 Subject: [PATCH 02/10] properly retrieve full lists of objects through api, fix #2 --- exporter.py | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/exporter.py b/exporter.py index cc2c723..6605b98 100644 --- a/exporter.py +++ b/exporter.py @@ -6,6 +6,7 @@ from datetime import datetime from logging import info, error, debug from pathlib import Path from urllib.request import urlopen, Request +import urllib.parse # (formatName, fileExtension) FORMATS: dict['str', 'str'] = { @@ -114,11 +115,18 @@ def make_dir(path: str): path_obj.mkdir(exist_ok=True, parents=True) -def api_get_bytes(path: str) -> bytes: - request: Request = Request(f'{API_PREFIX}/{path}', headers=HEADERS) +def api_get_bytes(path: str, **kwargs) -> bytes: + request_path: str = f'{API_PREFIX}/{path}' + + if len(kwargs) > 0: + params: str = urllib.parse.urlencode(kwargs) + request_path += f"?{params}" + + debug(f"Making http request: {request_path}") + + request: Request = Request(request_path, headers=HEADERS) with urlopen(request) as response: - response = response if response.status == 403: error("403 Forbidden, check your token!") 
exit(response.status) @@ -131,6 +139,28 @@ def api_get_dict(path: str) -> dict: return json.loads(data) +def api_get_listing(path: str) -> list: + """ + function for retrieving whole lists through api, it will + request for another 50 until have collected "total" amount + :param path: + :return: + """ + count: int = 50 + total: int = count + + result: list = [] + + while total > len(result): + data: dict = json.loads(api_get_bytes(path, count=count, offset=len(result))) + total = data.get('total') + result += data.get('data') + + debug(f"API listing got {total} items out of maximum {count}") + + return result + + def check_if_update_needed(file: str, remote_last_edit: datetime) -> bool: if not os.path.exists(file): return True @@ -162,7 +192,7 @@ def export(files: list[Node], level: str): info("Getting info about Shelves and their Books") -for shelf_data in api_get_dict('shelves').get('data'): +for shelf_data in api_get_listing('shelves'): shelf = Node(shelf_data.get('name'), None, shelf_data.get('id')) debug(f"Shelf: \"{shelf.get_name()}\", ID: {shelf.get_id()}") shelves[shelf.get_id()] = shelf @@ -178,7 +208,7 @@ for shelf_data in api_get_dict('shelves').get('data'): info("Getting info about Books not belonging to any shelf") -for book_data in api_get_dict('books').get('data'): +for book_data in api_get_listing('books'): if book_data.get('id') in books.keys(): continue book = Node(book_data.get('name'), None, book_data.get('id')) @@ -188,15 +218,16 @@ for book_data in api_get_dict('books').get('data'): info("Getting info about Chapters") -for chapter_data in api_get_dict('chapters').get('data'): +for chapter_data in api_get_listing('chapters'): chapter = Node(chapter_data.get('name'), books.get(chapter_data.get('book_id')), chapter_data.get('id')) debug(f"Chapter: \"{chapter.get_name()}\", ID: {chapter.get_id()}") chapters[chapter.get_id()] = chapter info("Getting info about Pages") -for page_data in api_get_dict('pages').get('data'): +for page_data in 
api_get_listing('pages'): parent_id = page_data.get('chapter_id') + if parent_id not in chapters.keys(): parent_id = page_data.get('book_id') info(f"Page \"{page_data.get('name')}\" is not in any chapter, " @@ -222,6 +253,7 @@ for lvl in LEVEL_CHOICE: export_pages_not_in_chapter = True elif lvl == 'books': files = books.values() + export(files, lvl) if export_pages_not_in_chapter: From ff31611d46b5f57249c0dc342370b35899a3ed7f Mon Sep 17 00:00:00 2001 From: Maciej Lebiest Date: Tue, 26 Jul 2022 17:30:08 +0200 Subject: [PATCH 03/10] add timestamps checking to prevent update of files that contain newest version --- exporter.py | 125 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 35 deletions(-) diff --git a/exporter.py b/exporter.py index 6605b98..f704b62 100644 --- a/exporter.py +++ b/exporter.py @@ -5,6 +5,7 @@ import os from datetime import datetime from logging import info, error, debug from pathlib import Path +from typing import Union from urllib.request import urlopen, Request import urllib.parse @@ -37,14 +38,11 @@ parser.add_argument('-t', '--token-file', type=str, default=f'.{os.path.sep}toke parser.add_argument('-H', '--host', type=str, default='https://localhost', help='Your domain with protocol prefix, example: https://example.com') parser.add_argument('-f', '--formats', type=str, default='markdown', - help=f'Coma separated list of formats to use for export.' - f' Available ones: {",".join([f for f in FORMATS.keys()])}') + help=f'Coma separated list of formats to use for export.', choices=FORMATS.keys()) parser.add_argument('-l', '--level', type=str, default='pages', - help=f'Coma separated list of levels at which should be export performed. ' - f'Available levels: {LEVELS}') + help=f'Coma separated list of levels at which should be export performed. ', choices=LEVELS) parser.add_argument('-V', '--log-level', type=str, default='info', - help=f'Set verbosity level. 
' - f'Available levels: {LOG_LEVEL.keys()}') + help=f'Set verbosity level. ', choices=LOG_LEVEL.keys()) args = parser.parse_args() @@ -77,19 +75,49 @@ HEADERS = {'Content-Type': 'application/json; charset=utf-8', class Node: - def __init__(self, name: str, parent: ['Node', None], node_id: int): + def __init__(self, name: str, + parent: Union['Node', None], + node_id: int, + last_edit_timestamp: datetime): self.__name: str = name - self.__parent: ['Node', None] = parent + self.__children: list['Node'] = [] + + self.__parent: Union['Node', None] = parent + if parent is not None: + parent.add_child(self) + + self.__last_edit_timestamp: datetime = last_edit_timestamp self.__node_id = node_id def get_name(self) -> str: return self.__name - def get_parent(self) -> ['Node', None]: + def get_parent(self) -> Union['Node', None]: return self.__parent + def changed_since(self, timestamp: datetime) -> int: + """ + Check if remote version have changed after given timestamp, including its children + :param timestamp: + :return: amount of changed documents at level of this document Node + """ + result: int = 0 + if self.__last_edit_timestamp > timestamp: + result += 1 + for child in self.__children: + result += child.changed_since(timestamp) + + return result + + def get_last_edit_timestamp(self) -> datetime: + return self.__last_edit_timestamp + def set_parent(self, parent: 'Node'): self.__parent = parent + parent.add_child(self) + + def add_child(self, child: 'Node'): + self.__children.append(child) def get_path(self) -> str: if self.__parent is None: @@ -107,6 +135,10 @@ pages: dict[int, Node] = {} pages_not_in_chapter: dict[int, Node] = {} +def api_timestamp_string_to_datetime(timestamp: str) -> datetime: + return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ') + + def make_dir(path: str): path_obj = Path(path) if path_obj.exists(): @@ -161,30 +193,37 @@ def api_get_listing(path: str) -> list: return result -def check_if_update_needed(file: str, remote_last_edit: 
datetime) -> bool: - if not os.path.exists(file): +def check_if_update_needed(file_path: str, document: Node) -> bool: + if not os.path.exists(file_path): + debug(f"Document {file_path} is missing on disk, update needed.") return True - local_last_edit: datetime = datetime.fromtimestamp(os.path.getmtime(file)) + local_last_edit: datetime = datetime.utcfromtimestamp(os.path.getmtime(file_path)) + remote_last_edit: datetime = document.get_last_edit_timestamp() + debug(f"Local file creation timestamp: {local_last_edit.date()} {local_last_edit.time()}, " f"remote edit timestamp: {remote_last_edit.date()} {remote_last_edit.time()}") - return local_last_edit.timestamp() < remote_last_edit.timestamp() + changes: int = document.changed_since(local_last_edit) + + if changes > 0: + info(f"Document \"{document.get_name()}\" consists of {changes} outdated documents, update needed.") + return True + + debug(f"Document \"{document.get_name()}\" consists of {changes} outdated documents.") + return False -def export(files: list[Node], level: str): - for file in files: - make_dir(f"{FS_PATH}{os.path.sep}{file.get_path()}") - - file_info: dict = api_get_dict(f'{level}/{file.get_id()}') - last_edit_time: datetime = datetime.strptime(file_info['updated_at'], '%Y-%m-%dT%H:%M:%S.%fZ') +def export(documents: list[Node], level: str): + for document in documents: + make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}") for frmt in formats: - path: str = f"{FS_PATH}{os.path.sep}{file.get_path()}{os.path.sep}{file.get_name()}.{FORMATS[frmt]}" + path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}{os.path.sep}{document.get_name()}.{FORMATS[frmt]}" debug(f"Checking for update for file {path}") - if not check_if_update_needed(path, last_edit_time): + if not check_if_update_needed(path, document): debug("Already updated") continue - data: bytes = api_get_bytes(f'{level}/{file.get_id()}/export/{frmt}') + data: bytes = api_get_bytes(f'{level}/{document.get_id()}/export/{frmt}') with 
open(path, 'wb') as f: info(f"Saving {path}") f.write(data) @@ -193,7 +232,10 @@ def export(files: list[Node], level: str): info("Getting info about Shelves and their Books") for shelf_data in api_get_listing('shelves'): - shelf = Node(shelf_data.get('name'), None, shelf_data.get('id')) + + last_edit_timestamp: datetime = api_timestamp_string_to_datetime(shelf_data['updated_at']) + shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'), last_edit_timestamp) + debug(f"Shelf: \"{shelf.get_name()}\", ID: {shelf.get_id()}") shelves[shelf.get_id()] = shelf @@ -202,7 +244,9 @@ for shelf_data in api_get_listing('shelves'): if shelf_details.get('books') is None: continue for book_data in shelf_details.get('books'): - book = Node(book_data.get('name'), shelf, book_data.get('id')) + + last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at']) + book = Node(book_data.get('name'), shelf, book_data.get('id'), last_edit_timestamp) debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}") books[book.get_id()] = book @@ -211,16 +255,23 @@ info("Getting info about Books not belonging to any shelf") for book_data in api_get_listing('books'): if book_data.get('id') in books.keys(): continue - book = Node(book_data.get('name'), None, book_data.get('id')) - debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}") + + last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at']) + book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_timestamp) + + debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}, last edit: {book.get_last_edit_timestamp()}") info(f"Book \"{book.get_name()} has no shelf assigned.\"") books[book.get_id()] = book info("Getting info about Chapters") for chapter_data in api_get_listing('chapters'): - chapter = Node(chapter_data.get('name'), books.get(chapter_data.get('book_id')), chapter_data.get('id')) - debug(f"Chapter: \"{chapter.get_name()}\", ID: {chapter.get_id()}") + 
last_edit_timestamp: datetime = api_timestamp_string_to_datetime(chapter_data['updated_at']) + chapter = Node(chapter_data.get('name'), + books.get(chapter_data.get('book_id')), + chapter_data.get('id'), + last_edit_timestamp) + debug(f"Chapter: \"{chapter.get_name()}\", ID: {chapter.get_id()}, last edit: {chapter.get_last_edit_timestamp()}") chapters[chapter.get_id()] = chapter info("Getting info about Pages") @@ -228,18 +279,22 @@ info("Getting info about Pages") for page_data in api_get_listing('pages'): parent_id = page_data.get('chapter_id') + last_edit_timestamp: datetime = api_timestamp_string_to_datetime(page_data['updated_at']) + if parent_id not in chapters.keys(): - parent_id = page_data.get('book_id') - info(f"Page \"{page_data.get('name')}\" is not in any chapter, " - f"using Book \"{books.get(parent_id).get_name()}\" as a parent.") - page = Node(page_data.get('name'), books.get(parent_id), page_data.get('id')) - debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}") + parent = books.get(page_data.get('book_id')) + page = Node(page_data.get('name'), parent, page_data.get('id'), last_edit_timestamp) + + info(f"Page \"{page.get_name()}\" is not in any chapter, " + f"using Book \"{parent.get_name()}\" as a parent.") + + debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}") pages[page.get_id()] = page pages_not_in_chapter[page.get_id()] = page continue - page = Node(page_data.get('name'), chapters.get(parent_id), page_data.get('id')) - debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}") + page = Node(page_data.get('name'), chapters.get(parent_id), page_data.get('id'), last_edit_timestamp) + debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}") pages[page.get_id()] = page files: list[Node] = [] From 947c25d34d7d55b21be5aa5c0b84f1ac595b5629 Mon Sep 17 00:00:00 2001 From: Maciej Lebiest Date: Wed, 27 Jul 2022 11:49:07 +0200 Subject: [PATCH 04/10] fix 
level and format parameters when more than one given --- README.md | 31 +++++++++++++------------------ exporter.py | 33 +++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index e1031fe..0d5fa78 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,9 @@ Full example on how to use the script: ```bash python exporter.py \ -H https://wiki.example.com \ - -f pdf,md,plaintext,html \ - -l pages,chapters,books + -f pdf md plaintext html \ + -l pages chapters books \ + --force-update-files \ -t ./token.txt \ -V debug \ -p ./ @@ -29,25 +30,19 @@ Customization: ```text options: -p PATH, --path PATH Path where exported files will be placed. - Default: . -t TOKEN_FILE, --token-file TOKEN_FILE File containing authorization token in format TOKEN_ID:TOKEN_SECRET - Default: ./token.txt -H HOST, --host HOST Your domain with protocol prefix, example: https://example.com - Default: https://localhost - -f FORMATS, --formats FORMATS - Coma separated list of formats to use for export. - Available ones: markdown,plaintext,pdf,html - default: markdown - -l LEVEL, --level LEVEL - Coma separated list of levels at which should be export performed. - Available levels: ['pages', 'chapters', 'books'] - Default: pages - -V LOG_LEVEL, --log-level LOG_LEVEL - Set verbosity level. - Available levels: dict_keys(['debug', 'info', 'warning', 'error']) - Default: info - + -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], + --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] + Space separated list of formats to use for export. + -l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...] + Space separated list of levels at which should be export performed. 
+ --force-update-files Set this option to skip checking local files timestamps against remote last edit + timestamps.This will cause overwriting local files, even if they seem to be already in + newest version. + -V {debug,info,warning,error}, --log-level {debug,info,warning,error} + Set verbosity level. ``` ### TODO: diff --git a/exporter.py b/exporter.py index f704b62..69d97b7 100644 --- a/exporter.py +++ b/exporter.py @@ -37,22 +37,23 @@ parser.add_argument('-t', '--token-file', type=str, default=f'.{os.path.sep}toke help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET') parser.add_argument('-H', '--host', type=str, default='https://localhost', help='Your domain with protocol prefix, example: https://example.com') -parser.add_argument('-f', '--formats', type=str, default='markdown', - help=f'Coma separated list of formats to use for export.', choices=FORMATS.keys()) -parser.add_argument('-l', '--level', type=str, default='pages', - help=f'Coma separated list of levels at which should be export performed. ', choices=LEVELS) +parser.add_argument('-f', '--formats', type=str, default=['markdown'], nargs="+", + help=f'Space separated list of formats to use for export.', choices=FORMATS.keys()) +parser.add_argument('-l', '--level', type=str, default=['pages'], nargs="+", + help=f'Space separated list of levels at which should be export performed. ', choices=LEVELS) +parser.add_argument('--force-update-files', action='store_true', + help="Set this option to skip checking local files timestamps against remote last edit timestamps." + "This will cause overwriting local files, even if they seem to be already in newest version.") +parser.set_defaults(force_update_files=False) parser.add_argument('-V', '--log-level', type=str, default='info', help=f'Set verbosity level. 
', choices=LOG_LEVEL.keys()) args = parser.parse_args() -if args.log_level not in LOG_LEVEL.keys(): - error(f"Bad log level {args.log_level}, available levels: {LOG_LEVEL.keys()}") - exit(1) - logging.basicConfig(format='%(levelname)s :: %(message)s', level=LOG_LEVEL.get(args.log_level)) -formats = args.formats.split(',') +formats: list[str] = args.formats + for frmt in formats: if frmt not in FORMATS.keys(): error("Unknown format name (NOT file extension), " @@ -61,7 +62,7 @@ for frmt in formats: API_PREFIX: str = f"{args.host.removesuffix(os.path.sep)}/api" FS_PATH: str = args.path.removesuffix(os.path.sep) -LEVEL_CHOICE: list[str] = args.level.split(',') +LEVEL_CHOICE: list[str] = args.level for lvl in LEVEL_CHOICE: if lvl not in LEVELS: error(f"Level {lvl} is not supported, can be only one of {LEVELS}") @@ -72,6 +73,7 @@ with open(args.token_file, 'r') as f: HEADERS = {'Content-Type': 'application/json; charset=utf-8', 'Authorization': f"Token {TOKEN}"} +SKIP_TIMESTAMPS: bool = args.force_update_files class Node: @@ -194,6 +196,10 @@ def api_get_listing(path: str) -> list: def check_if_update_needed(file_path: str, document: Node) -> bool: + if SKIP_TIMESTAMPS: + return True + debug(f"Checking for update for file {file_path}") + if not os.path.exists(file_path): debug(f"Document {file_path} is missing on disk, update needed.") return True @@ -205,10 +211,10 @@ def check_if_update_needed(file_path: str, document: Node) -> bool: changes: int = document.changed_since(local_last_edit) if changes > 0: - info(f"Document \"{document.get_name()}\" consists of {changes} outdated documents, update needed.") + info(f"Document \"{file_path}\" consists of {changes} outdated documents, update needed.") return True - debug(f"Document \"{document.get_name()}\" consists of {changes} outdated documents.") + debug(f"Document \"{file_path}\" consists of {changes} outdated documents, skipping updating.") return False @@ -218,9 +224,8 @@ def export(documents: list[Node], level: 
str): for frmt in formats: path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}{os.path.sep}{document.get_name()}.{FORMATS[frmt]}" - debug(f"Checking for update for file {path}") + if not check_if_update_needed(path, document): - debug("Already updated") continue data: bytes = api_get_bytes(f'{level}/{document.get_id()}/export/{frmt}') From 60053ed3fc01b17f1d69a8df84ae3f8a02bcefec Mon Sep 17 00:00:00 2001 From: Szwendacz99 Date: Wed, 27 Jul 2022 11:56:39 +0200 Subject: [PATCH 05/10] update readme --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0d5fa78..6af8c17 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ Customizable script for exporting notes from BookStack through API - export keeping the tree structure by making folders from Shelves, Books and Chapters - export multiple formats at once - export at multiple levels at once (export Books or/and Chapters or/and Pages as files) +- choose if local files should be updated only if their edit timestamp is older than remote document last edit, or timestamps should be ignored and files will always be overwritten with the newest version - customizable path for placing exported notes - authorization token is loaded from txt file @@ -46,7 +47,7 @@ options: ``` ### TODO: -- ~~choosing verbosity level through command line parameter~~ Done -- ~~choosing on what level should the notes be exported (Books, Chapters, Pages)~~ Done -- WIP: choosing if update note file only if the last edit timestamp from API is later that the local file timestamp -- suggestions? +- [x] ~~choosing verbosity level through command line parameter~~ Done +- [x] ~~choosing on what level should the notes be exported (Books, Chapters, Pages)~~ Done +- [x] ~~choosing if update note file only if the last edit timestamp from API is later that the local file timestamp~~ Done +- [ ] suggestions? 
From 5139a487608b8c4fcdc23bcc33a7192e5996f76d Mon Sep 17 00:00:00 2001 From: Szwendacz99 Date: Wed, 27 Jul 2022 12:02:06 +0200 Subject: [PATCH 06/10] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6af8c17..b289f21 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Full example on how to use the script: ```bash python exporter.py \ -H https://wiki.example.com \ - -f pdf md plaintext html \ + -f pdf markdown plaintext html \ -l pages chapters books \ --force-update-files \ -t ./token.txt \ From 50fe9b747687bf5469e686031f9f4bb9fb8d286f Mon Sep 17 00:00:00 2001 From: Maciej Lebiest <> Date: Wed, 8 Feb 2023 11:13:47 +0100 Subject: [PATCH 07/10] Add possibility to replace any characters in filename with "_" char. --- README.md | 4 + exporter.py | 230 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 153 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index b289f21..c40f916 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Customizable script for exporting notes from BookStack through API - export at multiple levels at once (export Books or/and Chapters or/and Pages as files) - choose if local files should be updated only if their edit timestamp is older than remote document last edit, or timestamps should be ignored and files will always be overwritten with the newest version - customizable path for placing exported notes +- configure replacing any characters in filenames with "_" for any filesystem compatibility - authorization token is loaded from txt file Requirements: @@ -21,6 +22,7 @@ python exporter.py \ -H https://wiki.example.com \ -f pdf markdown plaintext html \ -l pages chapters books \ + -c "/" "#" \ --force-update-files \ -t ./token.txt \ -V debug \ @@ -37,6 +39,8 @@ options: -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] 
Space separated list of formats to use for export. + -c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...], --forbidden-chars FORBIDDEN_CHARS [FORBIDDEN_CHARS ...] + Space separated list of symbols to be replaced with "_" in filenames. -l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...] Space separated list of levels at which should be export performed. --force-update-files Set this option to skip checking local files timestamps against remote last edit diff --git a/exporter.py b/exporter.py index 69d97b7..448ef72 100644 --- a/exporter.py +++ b/exporter.py @@ -5,6 +5,7 @@ import os from datetime import datetime from logging import info, error, debug from pathlib import Path +import sys from typing import Union from urllib.request import urlopen, Request import urllib.parse @@ -17,11 +18,7 @@ FORMATS: dict['str', 'str'] = { 'html': 'html' } -LEVELS = [ - 'pages', - 'chapters', - 'books' -] +LEVELS = ['pages', 'chapters', 'books'] LOG_LEVEL: dict = { 'debug': logging.DEBUG, @@ -30,35 +27,77 @@ LOG_LEVEL: dict = { 'error': logging.ERROR } +# Characters in filenames to be replaced with "_" +FORBIDDEN_CHARS: list[str] = ["/", "#"] + parser = argparse.ArgumentParser(description='BookStack exporter') -parser.add_argument('-p', '--path', type=str, default='.', +parser.add_argument('-p', + '--path', + type=str, + default='.', help='Path where exported files will be placed.') -parser.add_argument('-t', '--token-file', type=str, default=f'.{os.path.sep}token.txt', - help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET') -parser.add_argument('-H', '--host', type=str, default='https://localhost', - help='Your domain with protocol prefix, example: https://example.com') -parser.add_argument('-f', '--formats', type=str, default=['markdown'], nargs="+", - help=f'Space separated list of formats to use for export.', choices=FORMATS.keys()) -parser.add_argument('-l', '--level', type=str, default=['pages'], 
nargs="+", - help=f'Space separated list of levels at which should be export performed. ', choices=LEVELS) -parser.add_argument('--force-update-files', action='store_true', - help="Set this option to skip checking local files timestamps against remote last edit timestamps." - "This will cause overwriting local files, even if they seem to be already in newest version.") +parser.add_argument( + '-t', + '--token-file', + type=str, + default=f'.{os.path.sep}token.txt', + help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET') +parser.add_argument( + '-H', + '--host', + type=str, + default='https://localhost', + help='Your domain with protocol prefix, example: https://example.com') +parser.add_argument('-f', + '--formats', + type=str, + default=['markdown'], + nargs="+", + help='Space separated list of formats to use for export.', + choices=FORMATS.keys()) +parser.add_argument('-c', + '--forbidden-chars', + type=str, + default=FORBIDDEN_CHARS, + nargs="+", + help='Space separated list of symbols to be replaced ' + 'with "_" in filenames.') +parser.add_argument( + '-l', + '--level', + type=str, + default=['pages'], + nargs="+", + help="Space separated list of levels at which should be export " + "performed. ", + choices=LEVELS) +parser.add_argument( + '--force-update-files', + action='store_true', + help="Set this option to skip checking local files timestamps against " + "remote last edit timestamps. This will cause overwriting local files," + " even if they seem to be already in newest version.") parser.set_defaults(force_update_files=False) -parser.add_argument('-V', '--log-level', type=str, default='info', - help=f'Set verbosity level. 
', choices=LOG_LEVEL.keys()) +parser.add_argument('-V', + '--log-level', + type=str, + default='info', + help='Set verbosity level.', + choices=LOG_LEVEL.keys()) args = parser.parse_args() -logging.basicConfig(format='%(levelname)s :: %(message)s', level=LOG_LEVEL.get(args.log_level)) +logging.basicConfig(format='%(levelname)s :: %(message)s', + level=LOG_LEVEL.get(args.log_level)) formats: list[str] = args.formats +FORBIDDEN_CHARS = args.forbidden_chars for frmt in formats: - if frmt not in FORMATS.keys(): + if frmt not in FORMATS: error("Unknown format name (NOT file extension), " "check api docs for current version of your BookStack") - exit(1) + sys.exit(1) API_PREFIX: str = f"{args.host.removesuffix(os.path.sep)}/api" FS_PATH: str = args.path.removesuffix(os.path.sep) @@ -66,21 +105,25 @@ LEVEL_CHOICE: list[str] = args.level for lvl in LEVEL_CHOICE: if lvl not in LEVELS: error(f"Level {lvl} is not supported, can be only one of {LEVELS}") - exit(1) + sys.exit(1) -with open(args.token_file, 'r') as f: +with open(args.token_file, 'r', encoding='utf-8') as f: TOKEN: str = f.readline().removesuffix('\n') -HEADERS = {'Content-Type': 'application/json; charset=utf-8', - 'Authorization': f"Token {TOKEN}"} +HEADERS = { + 'Content-Type': 'application/json; charset=utf-8', + 'Authorization': f"Token {TOKEN}" +} SKIP_TIMESTAMPS: bool = args.force_update_files class Node: - def __init__(self, name: str, - parent: Union['Node', None], - node_id: int, + """Clas representing any node in whole bookstack documents "tree".""" + + def __init__(self, name: str, parent: Union['Node', None], node_id: int, last_edit_timestamp: datetime): + for char in FORBIDDEN_CHARS: + name = name.replace(char, "_") self.__name: str = name self.__children: list['Node'] = [] @@ -91,10 +134,12 @@ class Node: self.__last_edit_timestamp: datetime = last_edit_timestamp self.__node_id = node_id - def get_name(self) -> str: + @property + def name(self) -> str: return self.__name - def get_parent(self) -> 
Union['Node', None]: + @property + def parent(self) -> Union['Node', None]: return self.__parent def changed_since(self, timestamp: datetime) -> int: @@ -124,7 +169,7 @@ class Node: def get_path(self) -> str: if self.__parent is None: return "." - return self.__parent.get_path() + os.path.sep + self.__parent.get_name() + return self.__parent.get_path() + os.path.sep + self.__parent.name def get_id(self) -> int: return self.__node_id @@ -163,20 +208,21 @@ def api_get_bytes(path: str, **kwargs) -> bytes: with urlopen(request) as response: if response.status == 403: error("403 Forbidden, check your token!") - exit(response.status) + sys.exit(response.status) return response.read() def api_get_dict(path: str) -> dict: + """Make api request at specified path and return result as dict.""" data = api_get_bytes(path).decode() return json.loads(data) def api_get_listing(path: str) -> list: - """ - function for retrieving whole lists through api, it will - request for another 50 until have collected "total" amount + """Retrieve whole lists through api. + + Request for another 50 until have collected "total" amount. 
:param path: :return: """ @@ -186,9 +232,10 @@ def api_get_listing(path: str) -> list: result: list = [] while total > len(result): - data: dict = json.loads(api_get_bytes(path, count=count, offset=len(result))) - total = data.get('total') - result += data.get('data') + data: dict = json.loads( + api_get_bytes(path, count=count, offset=len(result))) + total = data['total'] + result += data['data'] debug(f"API listing got {total} items out of maximum {count}") @@ -196,6 +243,7 @@ def api_get_listing(path: str) -> list: def check_if_update_needed(file_path: str, document: Node) -> bool: + """Check if a Node need updating on disk, according to timestamps.""" if SKIP_TIMESTAMPS: return True debug(f"Checking for update for file {file_path}") @@ -203,80 +251,95 @@ def check_if_update_needed(file_path: str, document: Node) -> bool: if not os.path.exists(file_path): debug(f"Document {file_path} is missing on disk, update needed.") return True - local_last_edit: datetime = datetime.utcfromtimestamp(os.path.getmtime(file_path)) + local_last_edit: datetime = datetime.utcfromtimestamp( + os.path.getmtime(file_path)) remote_last_edit: datetime = document.get_last_edit_timestamp() - debug(f"Local file creation timestamp: {local_last_edit.date()} {local_last_edit.time()}, " - f"remote edit timestamp: {remote_last_edit.date()} {remote_last_edit.time()}") + debug("Local file creation timestamp: " + f"{local_last_edit.date()} {local_last_edit.time()}, " + "remote edit timestamp: " + f"{remote_last_edit.date()} {remote_last_edit.time()}") changes: int = document.changed_since(local_last_edit) if changes > 0: - info(f"Document \"{file_path}\" consists of {changes} outdated documents, update needed.") + info(f"Document \"{file_path}\" consists of {changes} " + "outdated documents, update needed.") return True - debug(f"Document \"{file_path}\" consists of {changes} outdated documents, skipping updating.") + debug(f"Document \"{file_path}\" consists of {changes} " + "outdated documents, 
skipping updating.") return False def export(documents: list[Node], level: str): + """Save Node to file.""" for document in documents: make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}") - for frmt in formats: - path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}{os.path.sep}{document.get_name()}.{FORMATS[frmt]}" + for v_format in formats: + path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}" + \ + f"{os.path.sep}{document.name}.{FORMATS[v_format]}" if not check_if_update_needed(path, document): continue - data: bytes = api_get_bytes(f'{level}/{document.get_id()}/export/{frmt}') - with open(path, 'wb') as f: + data: bytes = api_get_bytes( + f'{level}/{document.get_id()}/export/{v_format}') + with open(path, 'wb') as file: info(f"Saving {path}") - f.write(data) + file.write(data) info("Getting info about Shelves and their Books") for shelf_data in api_get_listing('shelves'): - last_edit_timestamp: datetime = api_timestamp_string_to_datetime(shelf_data['updated_at']) - shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'), last_edit_timestamp) + last_edit_ts: datetime = api_timestamp_string_to_datetime( + shelf_data['updated_at']) + shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'), + last_edit_ts) - debug(f"Shelf: \"{shelf.get_name()}\", ID: {shelf.get_id()}") + debug(f"Shelf: \"{shelf.name}\", ID: {shelf.get_id()}") shelves[shelf.get_id()] = shelf shelf_details = api_get_dict(f'shelves/{shelf.get_id()}') if shelf_details.get('books') is None: continue - for book_data in shelf_details.get('books'): + for book_data in shelf_details['books']: - last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at']) - book = Node(book_data.get('name'), shelf, book_data.get('id'), last_edit_timestamp) - debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}") + last_edit_ts: datetime = api_timestamp_string_to_datetime( + book_data['updated_at']) + book = Node(book_data.get('name'), shelf, book_data.get('id'), 
+ last_edit_ts) + debug(f"Book: \"{book.name}\", ID: {book.get_id()}") books[book.get_id()] = book info("Getting info about Books not belonging to any shelf") for book_data in api_get_listing('books'): - if book_data.get('id') in books.keys(): + if book_data.get('id') in books: continue - last_edit_timestamp: datetime = api_timestamp_string_to_datetime(book_data['updated_at']) - book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_timestamp) + last_edit_ts: datetime = api_timestamp_string_to_datetime( + book_data['updated_at']) + book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_ts) - debug(f"Book: \"{book.get_name()}\", ID: {book.get_id()}, last edit: {book.get_last_edit_timestamp()}") - info(f"Book \"{book.get_name()} has no shelf assigned.\"") + debug(f"Book: \"{book.name}\", ID: {book.get_id()}, " + f"last edit: {book.get_last_edit_timestamp()}") + info(f"Book \"{book.name} has no shelf assigned.\"") books[book.get_id()] = book info("Getting info about Chapters") for chapter_data in api_get_listing('chapters'): - last_edit_timestamp: datetime = api_timestamp_string_to_datetime(chapter_data['updated_at']) + last_edit_ts: datetime = api_timestamp_string_to_datetime( + chapter_data['updated_at']) chapter = Node(chapter_data.get('name'), books.get(chapter_data.get('book_id')), - chapter_data.get('id'), - last_edit_timestamp) - debug(f"Chapter: \"{chapter.get_name()}\", ID: {chapter.get_id()}, last edit: {chapter.get_last_edit_timestamp()}") + chapter_data.get('id'), last_edit_ts) + debug(f"Chapter: \"{chapter.name}\", ID: {chapter.get_id()}," + f" last edit: {chapter.get_last_edit_timestamp()}") chapters[chapter.get_id()] = chapter info("Getting info about Pages") @@ -284,38 +347,43 @@ info("Getting info about Pages") for page_data in api_get_listing('pages'): parent_id = page_data.get('chapter_id') - last_edit_timestamp: datetime = api_timestamp_string_to_datetime(page_data['updated_at']) + last_edit_ts: datetime = 
api_timestamp_string_to_datetime( + page_data['updated_at']) - if parent_id not in chapters.keys(): - parent = books.get(page_data.get('book_id')) - page = Node(page_data.get('name'), parent, page_data.get('id'), last_edit_timestamp) + if parent_id not in chapters: + parent = books[page_data['book_id']] + page = Node(page_data.get('name'), parent, page_data.get('id'), + last_edit_ts) - info(f"Page \"{page.get_name()}\" is not in any chapter, " - f"using Book \"{parent.get_name()}\" as a parent.") + info(f"Page \"{page.name}\" is not in any chapter, " + f"using Book \"{parent.name}\" as a parent.") - debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}") + debug(f"Page: \"{page.name}\", ID: {page.get_id()}," + f" last edit: {page.get_last_edit_timestamp()}") pages[page.get_id()] = page pages_not_in_chapter[page.get_id()] = page continue - page = Node(page_data.get('name'), chapters.get(parent_id), page_data.get('id'), last_edit_timestamp) - debug(f"Page: \"{page.get_name()}\", ID: {page.get_id()}, last edit: {page.get_last_edit_timestamp()}") + page = Node(page_data.get('name'), chapters.get(parent_id), + page_data.get('id'), last_edit_ts) + debug(f"Page: \"{page.name}\", ID: {page.get_id()}, " + f"last edit: {page.get_last_edit_timestamp()}") pages[page.get_id()] = page files: list[Node] = [] -export_pages_not_in_chapter: bool = False +EXPORT_PAGES_NOT_IN_CHAPTER: bool = False for lvl in LEVEL_CHOICE: if lvl == 'pages': - files = pages.values() + files = list(pages.values()) elif lvl == 'chapters': - files = chapters.values() - export_pages_not_in_chapter = True + files = list(chapters.values()) + EXPORT_PAGES_NOT_IN_CHAPTER = True elif lvl == 'books': - files = books.values() + files = list(books.values()) export(files, lvl) -if export_pages_not_in_chapter: +if EXPORT_PAGES_NOT_IN_CHAPTER: info("Exporting pages that are not in chapter...") - export(pages_not_in_chapter.values(), 'pages') + 
export(list(pages_not_in_chapter.values()), 'pages') From acf8b71e5ba7257261c55ba7c7d940738025f994 Mon Sep 17 00:00:00 2001 From: Maciej Lebiest Date: Sun, 16 Apr 2023 10:33:29 +0200 Subject: [PATCH 08/10] Add HTTP headers customization parameters. Fix for #3 --- README.md | 26 ++++++++++++++----- exporter.py | 73 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 74 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index c40f916..7c01d0d 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ Customizable script for exporting notes from BookStack through API - customizable path for placing exported notes - configure replacing any characters in filenames with "_" for any filesystem compatibility - authorization token is loaded from txt file +- Set custom HTTP User-Agent header to bypass filtering based on that header (like in CloudFlare tunnels) +- Set arbitrary custom headers through parameter Requirements: - Python at least in version 3.6 @@ -26,26 +28,38 @@ python exporter.py \ --force-update-files \ -t ./token.txt \ -V debug \ - -p ./ + -p ./ \ + --user-agent "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0" \ + --additional-headers "Header1: value1" "Header2: value2" ``` Customization: ```text options: + -h, --help show this help message and exit -p PATH, --path PATH Path where exported files will be placed. -t TOKEN_FILE, --token-file TOKEN_FILE File containing authorization token in format TOKEN_ID:TOKEN_SECRET -H HOST, --host HOST Your domain with protocol prefix, example: https://example.com -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], - --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] + --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] Space separated list of formats to use for export. -c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...], --forbidden-chars FORBIDDEN_CHARS [FORBIDDEN_CHARS ...] 
Space separated list of symbols to be replaced with "_" in filenames. - -l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...] + -u USER_AGENT, --user-agent USER_AGENT + User agent header content. In situations where requests are blocked + because of bad client/unrecognized web browser/etc (like with CloudFlare tunnels), + change to some typical web browser user agent header. + --additional-headers ADDITIONAL_HEADERS [ADDITIONAL_HEADERS ...] + List of arbitrary additional HTTP headers to be sent with every HTTP request. + They can override default ones, including Authorization header. + Example: -u "Header1: value1" "Header2": value2 + -l {pages,chapters,books} [{pages,chapters,books} ...], + --level {pages,chapters,books} [{pages,chapters,books} ...] Space separated list of levels at which should be export performed. - --force-update-files Set this option to skip checking local files timestamps against remote last edit - timestamps.This will cause overwriting local files, even if they seem to be already in - newest version. + --force-update-files Set this option to skip checking local files timestamps against + remote last edit timestamps. This will cause overwriting local files, + even if they seem to be already in newest version. -V {debug,info,warning,error}, --log-level {debug,info,warning,error} Set verbosity level. 
``` diff --git a/exporter.py b/exporter.py index 448ef72..5269601 100644 --- a/exporter.py +++ b/exporter.py @@ -6,12 +6,12 @@ from datetime import datetime from logging import info, error, debug from pathlib import Path import sys -from typing import Union +from typing import Dict, List, Union from urllib.request import urlopen, Request import urllib.parse # (formatName, fileExtension) -FORMATS: dict['str', 'str'] = { +FORMATS: Dict['str', 'str'] = { 'markdown': 'md', 'plaintext': 'txt', 'pdf': 'pdf', @@ -20,7 +20,7 @@ FORMATS: dict['str', 'str'] = { LEVELS = ['pages', 'chapters', 'books'] -LOG_LEVEL: dict = { +LOG_LEVEL: Dict = { 'debug': logging.DEBUG, 'info': logging.INFO, 'warning': logging.WARNING, @@ -28,7 +28,7 @@ LOG_LEVEL: dict = { } # Characters in filenames to be replaced with "_" -FORBIDDEN_CHARS: list[str] = ["/", "#"] +FORBIDDEN_CHARS: List[str] = ["/", "#"] parser = argparse.ArgumentParser(description='BookStack exporter') parser.add_argument('-p', @@ -62,6 +62,23 @@ parser.add_argument('-c', nargs="+", help='Space separated list of symbols to be replaced ' 'with "_" in filenames.') +parser.add_argument('-u', + '--user-agent', + type=str, + default="BookStack exporter", + help='User agent header content. In situations' + ' where requests are blocked because of bad client/' + 'unrecognized web browser/etc (like with CloudFlare' + ' tunnels), change to some typical ' + 'web browser user agent header.') +parser.add_argument('--additional-headers', + type=str, + nargs="+", + default=[], + help='List of arbitrary additional HTTP headers to be ' + 'sent with every HTTP request. They can override default' + ' ones, including Authorization header. 
' + 'Example: -u "Header1: value1" "Header2": value2') parser.add_argument( '-l', '--level', @@ -87,10 +104,18 @@ parser.add_argument('-V', args = parser.parse_args() + +def removesuffix(text, suffix): + """Remove suffix from text if matched.""" + if text.endswith(suffix): + return text[:len(text) - len(suffix)] + return text + + logging.basicConfig(format='%(levelname)s :: %(message)s', level=LOG_LEVEL.get(args.log_level)) -formats: list[str] = args.formats +formats: List[str] = args.formats FORBIDDEN_CHARS = args.forbidden_chars for frmt in formats: @@ -99,21 +124,28 @@ for frmt in formats: "check api docs for current version of your BookStack") sys.exit(1) -API_PREFIX: str = f"{args.host.removesuffix(os.path.sep)}/api" -FS_PATH: str = args.path.removesuffix(os.path.sep) -LEVEL_CHOICE: list[str] = args.level +API_PREFIX: str = f"{removesuffix(args.host, os.path.sep)}/api" +FS_PATH: str = removesuffix(args.path, os.path.sep) +LEVEL_CHOICE: List[str] = args.level for lvl in LEVEL_CHOICE: if lvl not in LEVELS: error(f"Level {lvl} is not supported, can be only one of {LEVELS}") sys.exit(1) with open(args.token_file, 'r', encoding='utf-8') as f: - TOKEN: str = f.readline().removesuffix('\n') + TOKEN: str = removesuffix(f.readline(), '\n') HEADERS = { 'Content-Type': 'application/json; charset=utf-8', - 'Authorization': f"Token {TOKEN}" + 'Authorization': f"Token {TOKEN}", + 'User-Agent': args.user_agent } +for header in args.additional_headers: + values = header.split(':', 1) + if len(values) < 2: + raise ValueError(f"Improper HTTP header specification: {header}") + HEADERS[values[0]] = values[1] + SKIP_TIMESTAMPS: bool = args.force_update_files @@ -125,7 +157,7 @@ class Node: for char in FORBIDDEN_CHARS: name = name.replace(char, "_") self.__name: str = name - self.__children: list['Node'] = [] + self.__children: List['Node'] = [] self.__parent: Union['Node', None] = parent if parent is not None: @@ -136,15 +168,18 @@ class Node: @property def name(self) -> str: + 
"""Return name of this Shelf/Book/Chapter/Page.""" return self.__name @property def parent(self) -> Union['Node', None]: + """Return parent Node or None if there isn't any.""" return self.__parent def changed_since(self, timestamp: datetime) -> int: """ - Check if remote version have changed after given timestamp, including its children + Check if remote version have changed after given timestamp, + including its children :param timestamp: :return: amount of changed documents at level of this document Node """ @@ -175,11 +210,11 @@ class Node: return self.__node_id -shelves: dict[int, Node] = {} -books: dict[int, Node] = {} -chapters: dict[int, Node] = {} -pages: dict[int, Node] = {} -pages_not_in_chapter: dict[int, Node] = {} +shelves: Dict[int, Node] = {} +books: Dict[int, Node] = {} +chapters: Dict[int, Node] = {} +pages: Dict[int, Node] = {} +pages_not_in_chapter: Dict[int, Node] = {} def api_timestamp_string_to_datetime(timestamp: str) -> datetime: @@ -271,7 +306,7 @@ def check_if_update_needed(file_path: str, document: Node) -> bool: return False -def export(documents: list[Node], level: str): +def export(documents: List[Node], level: str): """Save Node to file.""" for document in documents: make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}") @@ -370,7 +405,7 @@ for page_data in api_get_listing('pages'): f"last edit: {page.get_last_edit_timestamp()}") pages[page.get_id()] = page -files: list[Node] = [] +files: List[Node] = [] EXPORT_PAGES_NOT_IN_CHAPTER: bool = False for lvl in LEVEL_CHOICE: From cf1bb98dbd0d224bdd8abd69799a2afd95390e3f Mon Sep 17 00:00:00 2001 From: Maciej Lebiest Date: Sun, 31 Dec 2023 16:20:27 +0100 Subject: [PATCH 09/10] Add attachments exporting, fix for #4 --- README.md | 45 ++++++++++++++------- exporter.py | 113 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 136 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 7c01d0d..1bcaa07 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ 
Customizable script for exporting notes from BookStack through API #### Features: -- export keeping the tree structure by making folders from Shelves, Books and Chapters +- export keeping the tree structure by making folders from Shelves, Books, Chapters and attachments (including attachments from external links) - export multiple formats at once - export at multiple levels at once (export Books or/and Chapters or/and Pages as files) - choose if local files should be updated only if their edit timestamp is older than remote document last edit, or timestamps should be ignored and files will always be overwritten with the newest version @@ -35,31 +35,47 @@ python exporter.py \ Customization: ```text +usage: exporter.py [-h] [-p PATH] [-t TOKEN_FILE] [-H HOST] + [-f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...]] + [-c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...]] [-u USER_AGENT] + [--additional-headers ADDITIONAL_HEADERS [ADDITIONAL_HEADERS ...]] + [-l {pages,chapters,books} [{pages,chapters,books} ...]] + [--force-update-files] [--dont-export-attachments] + [--dont-export-external-attachments] [-V {debug,info,warning,error}] + +BookStack exporter + options: -h, --help show this help message and exit -p PATH, --path PATH Path where exported files will be placed. -t TOKEN_FILE, --token-file TOKEN_FILE File containing authorization token in format TOKEN_ID:TOKEN_SECRET -H HOST, --host HOST Your domain with protocol prefix, example: https://example.com - -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], - --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] + -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] Space separated list of formats to use for export. -c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...], --forbidden-chars FORBIDDEN_CHARS [FORBIDDEN_CHARS ...] 
Space separated list of symbols to be replaced with "_" in filenames. -u USER_AGENT, --user-agent USER_AGENT - User agent header content. In situations where requests are blocked - because of bad client/unrecognized web browser/etc (like with CloudFlare tunnels), - change to some typical web browser user agent header. + User agent header content. In situations where requests are blocked + because of bad client/unrecognized web browser/etc (like with + CloudFlare tunnels), change to some typical web browser user agent + header. --additional-headers ADDITIONAL_HEADERS [ADDITIONAL_HEADERS ...] - List of arbitrary additional HTTP headers to be sent with every HTTP request. - They can override default ones, including Authorization header. - Example: -u "Header1: value1" "Header2": value2 - -l {pages,chapters,books} [{pages,chapters,books} ...], - --level {pages,chapters,books} [{pages,chapters,books} ...] + List of arbitrary additional HTTP headers to be sent with every HTTP + request. They can override default ones, including Authorization + header. IMPORTANT: these headers are also sent when downloading + external attachments! Don't put here any private data.Example: -u + "Header1: value1" "Header2: value2" + -l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...] Space separated list of levels at which should be export performed. - --force-update-files Set this option to skip checking local files timestamps against - remote last edit timestamps. This will cause overwriting local files, - even if they seem to be already in newest version. + --force-update-files Set this option to skip checking local files timestamps against remote + last edit timestamps. This will cause overwriting local files, even if + they seem to be already in newest version. + --dont-export-attachments + Set this to prevent exporting attachments that were uploaded to + BookStack. 
+ --dont-export-external-attachments + Set this to prevent exporting external attachments (from links). -V {debug,info,warning,error}, --log-level {debug,info,warning,error} Set verbosity level. ``` @@ -68,4 +84,5 @@ options: - [x] ~~choosing verbosity level through command line parameter~~ Done - [x] ~~choosing on what level should the notes be exported (Books, Chapters, Pages)~~ Done - [x] ~~choosing if update note file only if the last edit timestamp from API is later that the local file timestamp~~ Done +- [x] ~~exporting attachments~~ - [ ] suggestions? diff --git a/exporter.py b/exporter.py index 5269601..dc186f5 100644 --- a/exporter.py +++ b/exporter.py @@ -9,6 +9,7 @@ import sys from typing import Dict, List, Union from urllib.request import urlopen, Request import urllib.parse +import base64 # (formatName, fileExtension) FORMATS: Dict['str', 'str'] = { @@ -77,8 +78,10 @@ parser.add_argument('--additional-headers', default=[], help='List of arbitrary additional HTTP headers to be ' 'sent with every HTTP request. They can override default' - ' ones, including Authorization header. ' - 'Example: -u "Header1: value1" "Header2": value2') + ' ones, including Authorization header. IMPORTANT: ' + 'these headers are also sent when downloading external ' + 'attachments! Don\'t put here any private data.' + 'Example: -u "Header1: value1" "Header2: value2"') parser.add_argument( '-l', '--level', @@ -94,7 +97,19 @@ parser.add_argument( help="Set this option to skip checking local files timestamps against " "remote last edit timestamps. This will cause overwriting local files," " even if they seem to be already in newest version.") +parser.add_argument( + '--dont-export-attachments', + action='store_true', + help= + "Set this to prevent exporting attachments that were uploaded to BookStack." 
+) +parser.add_argument( + '--dont-export-external-attachments', + action='store_true', + help="Set this to prevent exporting external attachments (from links).") parser.set_defaults(force_update_files=False) +parser.set_defaults(dont_export_attachments=False) +parser.set_defaults(dont_export_external_attachments=False) parser.add_argument('-V', '--log-level', type=str, @@ -140,11 +155,17 @@ HEADERS = { 'Authorization': f"Token {TOKEN}", 'User-Agent': args.user_agent } +HEADERS_NO_TOKEN = { + 'Content-Type': 'application/json; charset=utf-8', + 'User-Agent': args.user_agent +} + for header in args.additional_headers: values = header.split(':', 1) if len(values) < 2: raise ValueError(f"Improper HTTP header specification: {header}") HEADERS[values[0]] = values[1] + HEADERS_NO_TOKEN[values[0]] = values[1] SKIP_TIMESTAMPS: bool = args.force_update_files @@ -215,6 +236,7 @@ books: Dict[int, Node] = {} chapters: Dict[int, Node] = {} pages: Dict[int, Node] = {} pages_not_in_chapter: Dict[int, Node] = {} +attachments: Dict[int, Node] = {} def api_timestamp_string_to_datetime(timestamp: str) -> datetime: @@ -272,7 +294,7 @@ def api_get_listing(path: str) -> list: total = data['total'] result += data['data'] - debug(f"API listing got {total} items out of maximum {count}") + debug(f"API listing got {len(result)} items out of maximum {count}") return result @@ -286,7 +308,7 @@ def check_if_update_needed(file_path: str, document: Node) -> bool: if not os.path.exists(file_path): debug(f"Document {file_path} is missing on disk, update needed.") return True - local_last_edit: datetime = datetime.utcfromtimestamp( + local_last_edit: datetime = datetime.fromtimestamp( os.path.getmtime(file_path)) remote_last_edit: datetime = document.get_last_edit_timestamp() @@ -306,8 +328,8 @@ def check_if_update_needed(file_path: str, document: Node) -> bool: return False -def export(documents: List[Node], level: str): - """Save Node to file.""" +def export_doc(documents: List[Node], level: str): 
+ """Save document-like Nodes to files.""" for document in documents: make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}") @@ -325,6 +347,55 @@ def export(documents: List[Node], level: str): file.write(data) +def export_attachments(attachments: List[Node]): + """Save attachment Nodes to files.""" + for attachment in attachments: + + base_path = attachment.get_path() + if attachment.parent is None: + base_path = f'__ATTACHMENTS_FROM_DELETED_PAGES__{os.path.sep}{base_path}' + + make_dir(f"{FS_PATH}{os.path.sep}{base_path}") + + path: str = f"{FS_PATH}{os.path.sep}{base_path}" + \ + f"{os.path.sep}{attachment.name}" + + if not check_if_update_needed(path, attachment): + continue + + data = api_get_bytes(f'attachments/{attachment.get_id()}') + data = json.loads(data) + content = data['content'] + content_url = urllib.parse.urlparse(content) + + if content_url.scheme: + if args.dont_export_external_attachments: + continue + info(f"Downloading attachment from url: {content_url.geturl()}") + request: Request = Request(content_url.geturl(), + headers=HEADERS_NO_TOKEN) + + with urlopen(request) as response: + if response.status >= 300: + error( + "Could not download link-type attachment from " + f"'{content_url.geturl()}, got code {response.status}'!" 
+ ) + sys.exit(response.status) + + with open(path, 'wb') as file: + info(f"Saving {path}") + file.write(response.read()) + else: + with open(path, 'wb') as file: + info(f"Saving {path}") + file.write(base64.b64decode(content)) + + +######################### +# Gathering data from api +######################### + info("Getting info about Shelves and their Books") for shelf_data in api_get_listing('shelves'): @@ -405,6 +476,26 @@ for page_data in api_get_listing('pages'): f"last edit: {page.get_last_edit_timestamp()}") pages[page.get_id()] = page +if not args.dont_export_attachments: + info("Getting info about Attachments.") + + for attachment_data in api_get_listing('attachments'): + last_edit_ts: datetime = api_timestamp_string_to_datetime( + attachment_data['updated_at']) + all_pages = {} + all_pages.update(pages) + all_pages.update(pages_not_in_chapter) + attachment = Node(attachment_data.get('name'), + all_pages.get(attachment_data.get('uploaded_to')), + attachment_data.get('id'), last_edit_ts) + debug(f"Attachment: \"{attachment.name}\", ID: {attachment.get_id()}," + f" last edit: {attachment.get_last_edit_timestamp()}") + attachments[attachment.get_id()] = attachment + +######################### +# Exporting data from api +######################### + files: List[Node] = [] EXPORT_PAGES_NOT_IN_CHAPTER: bool = False @@ -417,8 +508,14 @@ for lvl in LEVEL_CHOICE: elif lvl == 'books': files = list(books.values()) - export(files, lvl) + export_doc(files, lvl) if EXPORT_PAGES_NOT_IN_CHAPTER: info("Exporting pages that are not in chapter...") - export(list(pages_not_in_chapter.values()), 'pages') + export_doc(list(pages_not_in_chapter.values()), 'pages') + +if not args.dont_export_attachments: + export_attachments(list(attachments.values())) + +info("Finished") +sys.exit(0) From 1fa56298dab779fa65fd759b98d887a4cb1f2a60 Mon Sep 17 00:00:00 2001 From: Maciej Lebiest Date: Tue, 2 Jan 2024 15:02:34 +0100 Subject: [PATCH 10/10] Add api request rate limiting, fix for #5 
--- README.md | 58 +++++++++++++++++++++++++++++++++-------------------- exporter.py | 42 ++++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 1bcaa07..a784b14 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,13 @@ Requirements: Full example on how to use the script: 1. Clone the repo 2. next to the script place token.txt file containing token id and token secret in format: TOKEN_ID:TOKEN_SECRET -3. in the same directory run the command, specifying your app domain with https prefix (every parameter is optional as it have default value, this is a full possible example): +3. in the same directory run the command, specifying your app domain with https prefix (every parameter is optional as it have default value, this is an example): ```bash python exporter.py \ -H https://wiki.example.com \ -f pdf markdown plaintext html \ -l pages chapters books \ + --rate-limit 180 \ -c "/" "#" \ --force-update-files \ -t ./token.txt \ @@ -37,45 +38,57 @@ Customization: ```text usage: exporter.py [-h] [-p PATH] [-t TOKEN_FILE] [-H HOST] [-f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...]] + [--rate-limit RATE_LIMIT] [-c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...]] [-u USER_AGENT] [--additional-headers ADDITIONAL_HEADERS [ADDITIONAL_HEADERS ...]] [-l {pages,chapters,books} [{pages,chapters,books} ...]] [--force-update-files] [--dont-export-attachments] - [--dont-export-external-attachments] [-V {debug,info,warning,error}] + [--dont-export-external-attachments] + [-V {debug,info,warning,error}] BookStack exporter -options: +optional arguments: -h, --help show this help message and exit -p PATH, --path PATH Path where exported files will be placed. 
-t TOKEN_FILE, --token-file TOKEN_FILE - File containing authorization token in format TOKEN_ID:TOKEN_SECRET - -H HOST, --host HOST Your domain with protocol prefix, example: https://example.com + File containing authorization token in format + TOKEN_ID:TOKEN_SECRET + -H HOST, --host HOST Your domain with protocol prefix, example: + https://example.com -f {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...], --formats {markdown,plaintext,pdf,html} [{markdown,plaintext,pdf,html} ...] Space separated list of formats to use for export. + --rate-limit RATE_LIMIT + How many api requests can be made in a minute. Default + is 180 (BookStack defaults) -c FORBIDDEN_CHARS [FORBIDDEN_CHARS ...], --forbidden-chars FORBIDDEN_CHARS [FORBIDDEN_CHARS ...] - Space separated list of symbols to be replaced with "_" in filenames. + Space separated list of symbols to be replaced with + "_" in filenames. -u USER_AGENT, --user-agent USER_AGENT - User agent header content. In situations where requests are blocked - because of bad client/unrecognized web browser/etc (like with - CloudFlare tunnels), change to some typical web browser user agent - header. + User agent header content. In situations where + requests are blocked because of bad + client/unrecognized web browser/etc (like with + CloudFlare tunnels), change to some typical web + browser user agent header. --additional-headers ADDITIONAL_HEADERS [ADDITIONAL_HEADERS ...] - List of arbitrary additional HTTP headers to be sent with every HTTP - request. They can override default ones, including Authorization - header. IMPORTANT: these headers are also sent when downloading - external attachments! Don't put here any private data.Example: -u - "Header1: value1" "Header2: value2" + List of arbitrary additional HTTP headers to be sent + with every HTTP request. They can override default + ones, including Authorization header. IMPORTANT: these + headers are also sent when downloading external + attachments! 
Don't put here any private data.Example: + -u "Header1: value1" "Header2: value2" -l {pages,chapters,books} [{pages,chapters,books} ...], --level {pages,chapters,books} [{pages,chapters,books} ...] - Space separated list of levels at which should be export performed. - --force-update-files Set this option to skip checking local files timestamps against remote - last edit timestamps. This will cause overwriting local files, even if - they seem to be already in newest version. + Space separated list of levels at which should be + export performed. + --force-update-files Set this option to skip checking local files + timestamps against remote last edit timestamps. This + will cause overwriting local files, even if they seem + to be already in newest version. --dont-export-attachments - Set this to prevent exporting attachments that were uploaded to - BookStack. + Set this to prevent exporting any attachments. --dont-export-external-attachments - Set this to prevent exporting external attachments (from links). + Set this to prevent exporting external attachments + (from links). -V {debug,info,warning,error}, --log-level {debug,info,warning,error} Set verbosity level. ``` @@ -85,4 +98,5 @@ options: - [x] ~~choosing on what level should the notes be exported (Books, Chapters, Pages)~~ Done - [x] ~~choosing if update note file only if the last edit timestamp from API is later that the local file timestamp~~ Done - [x] ~~exporting attachments~~ +- [x] ~~api rate limiting~~ - [ ] suggestions? 
diff --git a/exporter.py b/exporter.py index dc186f5..090834d 100644 --- a/exporter.py +++ b/exporter.py @@ -10,6 +10,8 @@ from typing import Dict, List, Union from urllib.request import urlopen, Request import urllib.parse import base64 +from time import time +from time import sleep # (formatName, fileExtension) FORMATS: Dict['str', 'str'] = { @@ -56,6 +58,11 @@ parser.add_argument('-f', nargs="+", help='Space separated list of formats to use for export.', choices=FORMATS.keys()) +parser.add_argument('--rate-limit', + type=int, + default=180, + help='How many api requests can be made in a minute. ' + 'Default is 180 (BookStack defaults)') parser.add_argument('-c', '--forbidden-chars', type=str, @@ -97,12 +104,9 @@ parser.add_argument( help="Set this option to skip checking local files timestamps against " "remote last edit timestamps. This will cause overwriting local files," " even if they seem to be already in newest version.") -parser.add_argument( - '--dont-export-attachments', - action='store_true', - help= - "Set this to prevent exporting attachments that were uploaded to BookStack." 
-)
+parser.add_argument('--dont-export-attachments',
+                    action='store_true',
+                    help="Set this to prevent exporting any attachments.")
 parser.add_argument(
     '--dont-export-external-attachments',
     action='store_true',
@@ -170,6 +174,51 @@ for header in args.additional_headers:
 SKIP_TIMESTAMPS: bool = args.force_update_files
 
 
+class ApiRateLimiter:
+    """Throttle outgoing API requests to a fixed per-minute budget.
+
+    Send times of recent requests are remembered; once the budget for the
+    last 60 seconds is used up, the caller sleeps until the oldest
+    remembered request drops out of that window.
+    """
+
+    def __init__(self, rate_limit: int) -> None:
+        """Store the budget and start with an empty request history.
+
+        :param rate_limit: allowed requests per 60 seconds; values below 1
+            are treated like 1.
+        """
+        self.__rate_limit = rate_limit
+        info(f"API rate limit: {self.__rate_limit}/min")
+        self.__requests_times: List[float] = []
+
+    def __forget_expired(self) -> float:
+        """Drop send times at least 60s old and return the current time."""
+        current_time = time()
+        self.__requests_times = [
+            sent for sent in self.__requests_times if current_time - sent < 60
+        ]
+        return current_time
+
+    def limit_rate_request(self):
+        """Count another request and wait minimal required time if limit is reached."""
+        current_time = self.__forget_expired()
+        # Wait *before* counting this request and re-read the clock after
+        # sleeping, so the stored time is when the request is really sent.
+        # Storing the pre-sleep time would let that entry expire too early
+        # and allow more than rate_limit requests within 60 seconds.
+        while (self.__requests_times
+               and len(self.__requests_times) >= self.__rate_limit):
+            wait_time = self.__requests_times[0] + 60 - current_time
+            info(f"API Rate limit reached, waiting {round(wait_time, 2)}s")
+            sleep(wait_time)
+            current_time = self.__forget_expired()
+        self.__requests_times.append(current_time)
+
+
+api_rate_limiter = ApiRateLimiter(args.rate_limit)
+
+
 class Node:
     """Clas representing any node in whole bookstack documents "tree"."""
 
@@ -262,6 +311,7 @@ def api_get_bytes(path: str, **kwargs) -> bytes:
 
     request: Request = Request(request_path, headers=HEADERS)
 
+    api_rate_limiter.limit_rate_request()
     with urlopen(request) as response:
         if response.status == 403:
             error("403 Forbidden, check your token!")