# pyblosxom2leonardo.py
# Copyright (c) 2005 Tim Wegener
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject
# to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
"""Convert a PyBlosxom blog to Leonardo LFS format.

Known limitation: this almost works, but it does not yet update
ldb/property_index.


Usage:
python pyblosxom2leonardo.py <old_blog_dir> <old_comment_dir> <lfs_blog_dir>

This requires Python > 2.3 (for os.walk).
If you want a backport, let me know.

This script is available under the MIT License.

"""

__author__ = 'Tim Wegener <twegener@fastmail.fm> http://www.madabar.com'


import re
import os
import sys
import time
import shelve
import fnmatch
import xml.dom.minidom


def get_element_text(dom, name):
    """Return the node text for the first DOM element called name.

    Arguments:
    dom -- DOM document or node providing getElementsByTagName
    name -- tag name of the element to look up

    Returns the nodeValue of the element's first child.  An empty string
    is returned when no element with that tag name exists, or when the
    element has no children.

    """
    elements = dom.getElementsByTagName(name)
    # Test the node list itself: indexing an empty list raises IndexError
    # before any check on the element could run.
    if not elements:
        return ''

    first_child = elements[0].firstChild
    if first_child is None:
        return ''
    return first_child.nodeValue


# Patterns used to flatten a comment's HTML into plain text.  They are
# compiled without flags on purpose: none of them uses '.', '^' or '$',
# so MULTILINE/DOTALL would have no effect.
_BR_TAG_RE = re.compile(r'<br\s*[/\\]?>')
_ANY_TAG_RE = re.compile(r'<[^>]*?>')
_ENTITY_RE = re.compile(r'&\w{3,4};')


def _comment_html_to_text(body):
    """Return a comment's HTML body flattened to plain text."""
    # todo: It would be nice if Leonardo support xhtml comments.
    body = body.replace('&amp;', '&')
    body = body.replace('&quot;', "'")
    # Line breaks become newlines; accept <br>, <br/>, <br /> and <br\>.
    body = _BR_TAG_RE.sub('\n', body)
    # Remove any unsupported tags.  Links lose their target here; only
    # the link text survives.
    body = _ANY_TAG_RE.sub('', body)
    # Drop any remaining short character entities.
    body = _ENTITY_RE.sub('', body)
    return body


def _write_text(path, text):
    """Write text to a new file at path, closing it even on error."""
    f = open(path, 'w')
    try:
        f.write(text)
    finally:
        f.close()


def _write_property_db(directory, properties):
    """Store properties in a 'property_db' shelve inside directory."""
    property_db = shelve.open(os.path.join(directory, 'property_db'))
    try:
        property_db.update(properties)
    finally:
        property_db.close()


def _read_comments(comment_dir, entry_name, quiet):
    """Return (properties, body) pairs for an entry, oldest comment first."""
    # Escape the entry name and anchor the pattern so that only files
    # named exactly '<entry_name>-<timestamp>.cmt' are picked up.
    comment_re = re.compile(r'%s-([\d\.]+)\.cmt$' % re.escape(entry_name))

    comments = []
    for comment_filename in os.listdir(comment_dir):
        if not comment_re.match(comment_filename):
            continue

        comment_path = os.path.join(comment_dir, comment_filename)

        if not quiet:
            sys.stdout.write('comment filename: %s\n' % comment_path)

        # Parse the comment XML data.
        # Note: The <source> element is ignored.
        # Passing the path lets minidom open and close the file itself.
        dom = xml.dom.minidom.parse(comment_path)
        comment = {
            'comment_title': get_element_text(dom, 'title'),
            'author_name': get_element_text(dom, 'author'),
            'author_link': get_element_text(dom, 'link'),
            'creation_time': float(get_element_text(dom, 'pubDate')),
            'last_modified': os.path.getmtime(comment_path),
        }
        comments.append((comment, get_element_text(dom, 'description')))

    # Leonardo numbers comments in order, so sort on creation time.
    comments.sort(key=lambda item: item[0]['creation_time'])
    return comments


def pyblosxom2leonardo(old_blog_dir, old_comment_dir, lfs_blog_dir,
                       extension='.txt', use_last=True, quiet=False):
    """Convert a pyblosxom blog data store to a Leonardo LFS datastore.

    Each blog entry becomes <lfs_blog_dir>/YYYY/MM/DD/<entry>.ldv/ holding
    a property_db shelve and a __content__.xhtml file.  Each comment
    becomes a __comment__<N> subdirectory (N starting at 1, ordered by
    creation time) holding its own property_db and a __content__.txt.

    Arguments:
    old_blog_dir -- pyblosxom blog data directory
    old_comment_dir -- pyblosxom comment data directory
    lfs_blog_dir -- blog subdirectory underneath lfs directory
    extension -- extension of blog entry filenames
                 (default: '.txt')
    use_last -- only use deepest subdirectory of blog entry for category
                otherwise, use each directory component as a category
                (default: True)
    quiet -- if set don't output informative messages
             (default: False)

    Target directories are created with os.makedirs/os.mkdir without
    checking whether they already exist, so converting into a tree that
    already holds an entry of the same name and date raises OSError.

    """
    # todo: local or gmt?
    time2tuple = time.localtime

    for dirpath, dirnames, filenames in os.walk(old_blog_dir):

        # Directory components below old_blog_dir.  os.walk builds every
        # dirpath by joining onto the top directory as given, so slicing
        # off that prefix and dropping empty parts works whether or not
        # old_blog_dir ends with a separator.
        relative_dir = dirpath[len(old_blog_dir):]
        dir_parts = [part for part in relative_dir.split(os.sep) if part]

        # The comment tree mirrors the blog tree.
        comment_dir = os.path.join(old_comment_dir, *dir_parts)

        # Grab categories from dirname.
        categories = dir_parts
        if use_last:
            categories = categories[-1:]

        for filename in fnmatch.filter(filenames, '*%s' % extension):
            # Read old PyBlosxom data.
            path = os.path.join(dirpath, filename)

            if not quiet:
                sys.stdout.write('entry filename: %s\n' % path)

            # Grab timestamps from blog file.
            # NOTE(review): getctime is the inode change time on Unix
            # rather than a creation time -- confirm this is the date
            # the entry should be filed under.
            mtime = os.path.getmtime(path)
            ctime = os.path.getctime(path)

            # The first line is the title; the rest is the content.
            f = open(path, 'r')
            try:
                page_title = f.readline().strip()
                body = f.read()
            finally:
                f.close()

            # Write new Leonardo LFS data.

            # Create LFS dirs based on date.
            # Note that month and day are zero padded to two digits!
            time_tuple = time2tuple(ctime)
            lfs_date_dir = os.path.join(lfs_blog_dir,
                                        '%04d' % time_tuple[0],
                                        '%02d' % time_tuple[1],
                                        '%02d' % time_tuple[2])

            # Create LFS .ldv subdir based on blog file name.
            entry_name = os.path.splitext(filename)[0]
            ldv_dir = os.path.join(lfs_date_dir, '%s.ldv' % entry_name)
            os.makedirs(ldv_dir)

            _write_property_db(ldv_dir, {
                'page_title': page_title,
                'categories': categories,
                'allow_comments': 'YES',
                'allow_trackbacks': 'YES',
                'creation_time': ctime,
                'last_modified': mtime,
            })

            # Create __content__.xhtml file from blog file body.
            _write_text(os.path.join(ldv_dir, '__content__.xhtml'), body)

            # Comments
            if not os.path.exists(comment_dir):
                continue

            comment_i = 0  # Leonardo comments start at 1.
            for comment, comment_body in _read_comments(comment_dir,
                                                        entry_name, quiet):
                comment_i += 1

                # Create comment directory.
                comment_ldv_dir = os.path.join(ldv_dir,
                                               '__comment__%d' % comment_i)
                os.mkdir(comment_ldv_dir)

                # Leonardo comments are plain text, so convert tags to text.
                _write_text(os.path.join(comment_ldv_dir, '__content__.txt'),
                            _comment_html_to_text(comment_body))

                _write_property_db(comment_ldv_dir, comment)
                

def main():
    """Run the conversion using the directories given on the command line.

    Expects exactly three arguments: the PyBlosxom blog data directory,
    the PyBlosxom comment directory and the Leonardo blog data directory.
    With any other argument count, the module usage text is written to
    stdout and the process exits with status 2.

    """
    arguments = sys.argv[1:]

    # Usage
    if len(arguments) != 3:
        sys.stdout.write(__doc__)
        sys.exit(2)

    # PyBlosxom blog and comment directories first, then the Leonardo
    # blog data directory.
    old_blog_dir, old_comment_dir, lfs_blog_dir = arguments

    pyblosxom2leonardo(old_blog_dir, old_comment_dir, lfs_blog_dir)


# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()