iTunes Music to Data, via Python

Music Treemap

8000+ iTunes songs by genre and artist, colored by rating (ManyEyes version)

The track information stored in iTunes is pretty interesting from a visualization point of view, as it contains dates, durations, categories, groupings — all the sorts of things that make for complex, interesting data to look at. The only issue is … it’s in iTunes, and I’d like to get a CSV version of it so I can use it in a bunch of tools.

So, here is the result: a couple of Python scripts that use standard libraries to read the XML file exported by iTunes and convert it to CSV. It’s not general or robust code, just a script that worked for me and should be pretty easy for you to modify. I’m not a Pythonista, mostly doing Java, so apologies for non-idiomatic usage. Feel free to post corrections or suggestions in the comments, as this is also a learning exercise for me.

 '''
     MUSIC.PY
 '''
 import datetime, math
 # Reference instant for dates: _get_item_value reports each parsed date as
 # the number of whole days elapsed since this moment.
 EPOCH = datetime.datetime(1970,1,1)
 def remove_whitespace(node):
     """Strip whitespace-only text nodes from the DOM subtree below *node*.

     The tree is modified in place and nothing is returned. Text nodes that
     contain any non-whitespace character are left untouched.
     """
     blanks = []
     # Collect first and delete afterwards, so childNodes is never mutated
     # while it is being iterated.
     for kid in node.childNodes:
         is_text = kid.nodeType == node.TEXT_NODE
         if is_text and not kid.data.strip():
             blanks.append(kid)
             continue
         if kid.hasChildNodes():
             remove_whitespace(kid)
     # Detach each blank node, then unlink it so its memory can be reclaimed.
     for blank in blanks:
         blank.parentNode.removeChild(blank)
         blank.unlink()
def _isType(node, name):
    """Tell whether *node* has children and its first child's value is *name*.

    Used to recognise a plist ``<key>`` element by the text it contains.
    """
    if not node.hasChildNodes():
        return False
    first_child = node.childNodes[0]
    return first_child.nodeValue == name
def getItems(node, name):
    """Return the ``<dict>`` elements found under the value stored at key *name*.

    *node* is expected to hold alternating key and value children, so the
    value for a key is the sibling immediately after it. This relies on
    whitespace-only text nodes having been removed beforehand (the main
    script calls ``remove_whitespace`` first); otherwise keys and values are
    not adjacent.

    Returns the result of ``getElementsByTagName('dict')`` on the first
    matching value, i.e. every descendant ``<dict>`` element.

    Raises IndexError if no child of *node* is a key named *name*.
    """
    children = node.childNodes
    # Walk adjacent (key, value) sibling pairs instead of indexing by hand.
    values = [value for key, value in zip(children, children[1:])
              if _isType(key, name)]
    if not values:
        # Same exception type as the bare items[0] lookup used to raise,
        # but with enough context to see which section was missing.
        raise IndexError("no %r entry found under <%s>" % (name, node.nodeName))
    return values[0].getElementsByTagName('dict')
def get_item(node, target):
    """Look up the value stored under key *target* in a track element.

    *node* is expected to hold alternating key and value children with no
    whitespace text nodes in between.

    Returns:
        * ``0`` when *target* is 'Play Count' and the key is absent;
        * ``None`` when any other key is absent;
        * ``None`` (after printing a diagnostic) when the key is present but
          its value element is empty or cannot be converted;
        * the floor of value / 1000 for 'Total Time' (presumably turning
          milliseconds into whole seconds -- confirm against the iTunes
          export format);
        * otherwise the value as converted by ``_get_item_value``.
    """
    children = node.childNodes
    # Each value is the sibling immediately following its key.
    values = [value for key, value in zip(children, children[1:])
              if _isType(key, target)]
    if not values:
        # A missing play count is reported as zero rather than as "unknown".
        return 0 if target == 'Play Count' else None

    result = _get_item_value(values[0])
    if result is None:
        # Check this before any arithmetic: an empty 'Total Time' element
        # used to blow up in the division below instead of being reported.
        print("No children for %s  ...  %s" % (target, values[0]))
        return None
    if target == 'Total Time':
        return math.floor(result / 1000)
    return result
def _get_item_value(item):
    """Convert a plist value element to a native Python value.

    The conversion is chosen from the element's tag name:

    * ``string``  -> the text, with double quotes replaced by single quotes;
    * ``integer`` -> ``int``;
    * ``real``    -> ``float``;
    * ``date``    -> whole days between EPOCH and the timestamp, which must
      look like ``2009-03-14T18:25:43Z``;
    * anything else -> the first child's node value unchanged (``None`` when
      that child is not a text node).

    Returns ``None`` when *item* has no children at all.

    Raises ValueError if an ``integer``, ``real`` or ``date`` element holds
    text that cannot be parsed.
    """
    if not item.hasChildNodes():
        return None
    data = item.childNodes[0].nodeValue
    kind = item.nodeName
    if kind == 'string':
        return data.replace('"', "'")
    if kind == 'integer':
        return int(data)
    if kind == 'real':
        return float(data)
    if kind == 'date':
        parsed = datetime.datetime.strptime(data, '%Y-%m-%dT%H:%M:%SZ')
        return (parsed - EPOCH).days
    # Unknown element types used to be fed to the date parser and crash;
    # hand back the raw value so the caller decides what to do with it.
    return data

 '''
     MAIN.PY
 '''
 from xml.dom import minidom
 import csv

 from music import getItems, get_item, remove_whitespace

 # Input: the library XML exported by iTunes. Output: one CSV row per track.
 # The script reads SRC below, so the constant must carry exactly that name.
 SRC = "/Users/graham/Documents/data/music/Library.xml"
 TRACKS = "/Users/graham/Documents/data/music/LibraryTracks.csv"
 # Keys copied from each track, in CSV column order; also used as the header.
 TRACK_INFO = ["Track ID", "Name", "Artist", "Genre", "Total Time", "Year",
               "Date Added", "Play Count", "Rating"]

 # Parse the export; the file can be closed as soon as the DOM is built.
 with open(SRC) as srcfile:
     dom = minidom.parse(srcfile)
 # Drop indentation text nodes so every key sits directly next to its value.
 remove_whitespace(dom)

 # The first child of <plist> is the top-level dictionary of the library.
 main = dom.getElementsByTagName('plist')[0].childNodes[0]
 tracks = getItems(main, 'Tracks')

 # Write out the values to a csv file. Binary mode is what the Python 2 csv
 # module expects; on Python 3 open with mode "w" and newline="" instead.
 with open(TRACKS, "wb") as out:
     writer = csv.writer(out)
     writer.writerow(TRACK_INFO)
     for track in tracks:
         writer.writerow([get_item(track, field) for field in TRACK_INFO])