#!/usr/bin/env python
import sys
import mwxml
import os
import re

def sanitize_filename(s):
    """Return a version of *s* that is safe to use as a file name component.

    Surrounding whitespace is removed, every character that is reserved in
    file names on common platforms (``/ \\ : * ? " < > |``) and every ASCII
    control character (NUL, tab, newline, ...) is replaced with an
    underscore, and the result is limited to 100 characters.

    Args:
        s: The text to sanitize (for example a page title or a user name).

    Returns:
        The sanitized string; it may be empty if *s* was empty or blank.
    """
    # Strip first so that leading/trailing newlines or tabs are dropped
    # rather than turned into underscores by the substitution below.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', s.strip())
    # Cutting at 100 characters can expose whitespace that used to be in the
    # middle of the string, so trim the tail once more after truncating.
    return cleaned[:100].rstrip()

def save_to_txt(title, author, date, text):
    """Write *text* to a UTF-8 ``.txt`` file named after the page.

    The file name has the form ``<title>_<author>_<date>.txt``. Title and
    author are passed through ``sanitize_filename``; *date* is inserted as
    given. The name is a relative path, and the file is opened in write
    mode, so an existing file with the same name is overwritten.

    Args:
        title: Page title used as the first part of the file name.
        author: Contributor name used as the second part of the file name.
        date: Date string used as the third part of the file name.
        text: The content to write.
    """
    safe_title = sanitize_filename(title)
    safe_author = sanitize_filename(author)
    target = "{}_{}_{}.txt".format(safe_title, safe_author, date)
    with open(target, "w", encoding="utf-8") as out:
        out.write(text)

def _latest_revision(page):
    """Return the last revision yielded by iterating *page*, or None if empty."""
    latest = None
    for latest in page:
        pass
    return latest


def _revision_author(revision):
    """Return the contributor name of *revision*, or "Unknown" if unavailable."""
    # NOTE(review): mwxml appears to expose the contributor as
    # ``revision.user`` with the name in ``.text`` - confirm against the
    # installed version. ``contributor``/``username`` (the names this script
    # used before) are kept as fallbacks so either object shape works.
    contributor = (getattr(revision, "user", None)
                   or getattr(revision, "contributor", None))
    if contributor is None:
        return "Unknown"
    name = (getattr(contributor, "text", None)
            or getattr(contributor, "username", None))
    # A missing or empty name must not reach sanitize_filename(), which
    # requires a string.
    return name or "Unknown"


def _revision_date(revision):
    """Return the revision timestamp as YYYY-MM-DD, or "UnknownDate"."""
    timestamp = getattr(revision, "timestamp", None)
    if timestamp is not None and hasattr(timestamp, "strftime"):
        return timestamp.strftime("%Y-%m-%d")
    return "UnknownDate"


def main():
    """Export the latest revision of every page in a MediaWiki XML dump.

    Expects exactly one command-line argument, the path to the XML dump.
    For each page that has at least one revision, the title, author, date
    and a 200-character text snippet are printed, and the full text is
    written to a file via ``save_to_txt``. The number of pages seen
    (including pages without revisions) is printed at the end.

    Exits with status 1 after printing a usage message when the argument
    count is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: python wikidump2txt.py <xml file>")
        sys.exit(1)

    xml_file = sys.argv[1]

    total_pages = 0
    # Keep the dump file open only for the duration of the parse so the
    # handle is closed even if processing a page raises.
    with open(xml_file, 'rb') as dump_file:
        dump = mwxml.Dump.from_file(dump_file)

        for page in dump.pages:
            total_pages += 1

            # The last revision yielded is assumed to be the most recent.
            last_revision = _latest_revision(page)
            if last_revision is None:
                continue

            title = page.title
            text = last_revision.text or ""
            author = _revision_author(last_revision)
            date = _revision_date(last_revision)

            print("Title:", title)
            print("Author:", author)
            print("Date:", date)
            print("Text snippet:", text[:200] + ("..." if len(text) > 200 else ""))
            print("=" * 80)

            # Save the full text to a file.
            save_to_txt(title, author, date, text)

    print("Total pages:", total_pages)

# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger any processing.
if __name__ == "__main__":
    main()
