Closure Library 向けの ATOM パーサーを作りました

ちょっとした理由で ATOM フィードを解析する Closure Library ベースのクラスを作ってみましたので、本日はそれをご紹介したいと思います。

まあ、あまり需要があるとも思えませんが、 GData API のアクセスなんかにも使えるので、 Closure Library で Google (OpenSocial) ガジェットや Chrome 拡張なんかを作るときは便利だと思います。

ソースコード

まずはソースコードです。 Google Code Project Hosting にアップロードしてあります。

http://webos-goodies.googlecode.com/svn/trunk/prod...

このファイルを Closure Library アプリケーションのソースツリーの中（depswriter.py が走査するディレクトリの中）にコピーして deps.js を再構築すれば、以下の goog.require() 呼び出しで使えるようになります。

goog.require('atomparser.Feed');

内容はこんな感じになってます。

goog.provide('atomparser');
goog.provide('atomparser.Feed');
goog.provide('atomparser.Entry');
goog.provide('atomparser.Error');
goog.require('goog.array');
goog.require('goog.date');
goog.require('goog.debug.Error');

/**
 * Concatenates the character data held directly by an element.
 * Only direct children that are text nodes (nodeType 3) or CDATA sections
 * (nodeType 4) contribute; text nested inside child elements is not visited.
 * @param {Element} element The element whose text is gathered.
 * @return {string} The concatenated text, or '' when there is none.
 */
atomparser.collectsText = function(element) {
  var TEXT_NODE = 3, CDATA_SECTION_NODE = 4;
  var children = element.childNodes;
  var count = children.length;
  var text = '';
  for(var idx = 0 ; idx < count ; idx++) {
    var child = children[idx];
    var type = child.nodeType;
    if(type == TEXT_NODE || type == CDATA_SECTION_NODE) {
      text += child.data || '';
    }
  }
  return text;
};

/**
 * Builds a composite lookup key of the form 'str1|str2'.
 * Every '|' inside str1 is escaped as '%7C', so the first literal '|' in the
 * result is always the separator. str2 is appended unchanged.
 * @param {?string} str1 The first string. A falsy value is treated as ''.
 * @param {?string} str2 The second string. A falsy value is treated as ''.
 * @return {string} The joined string.
 */
atomparser.joinStr = function(str1, str2) {
  // A string pattern would replace only the first '|'; the global regex
  // escapes all of them so that keys stay unambiguous.
  return (str1 || '').replace(/\|/g, '%7C') + '|' + (str2 || '');
};

/**
 * Shared parsing logic for ATOM container elements.
 * Walks the direct child elements of rootNode once. Every child element is
 * indexed by namespace URI and local name; children in the ATOM namespace
 * with a well-known name are additionally extracted into properties, the
 * link map, or the entry list.
 * @constructor
 * @param {!Element} rootNode The element whose children are parsed.
 */
atomparser.Base = function(rootNode) {
  this.rootNode_   = rootNode;
  this.children_   = {};
  this.id          = '';
  this.published   = null;
  this.updated     = null;
  this.title       = '';
  this.summary     = '';
  this.authorName  = '';
  this.authorEMail = '';
  this.link_       = {};
  this.entries_    = [];

  var childNodes = rootNode.childNodes;
  var total      = childNodes.length;
  for(var pos = 0 ; pos < total ; pos++) {
    var child = childNodes[pos];
    // Only element nodes (nodeType 1) are of interest.
    if(child.nodeType == 1) {
      // localName is the standard property; baseName is the fallback used
      // when localName is not available.
      var localName = child.localName || child.baseName;

      // Index every element, whatever its namespace, for getElements().
      var key    = atomparser.joinStr(child.namespaceURI, localName);
      var bucket = this.children_[key];
      if(!bucket) {
        bucket = this.children_[key] = [];
      }
      bucket.push(child);

      // Only ATOM-namespace elements feed the predefined properties.
      if(child.namespaceURI == atomparser.Base.NAMESPACE_URI) {
        switch(localName) {
          case 'id':
            this.id = atomparser.collectsText(child);
            break;
          case 'published':
            this.published = goog.date.fromIsoString(atomparser.collectsText(child));
            break;
          case 'updated':
            this.updated = goog.date.fromIsoString(atomparser.collectsText(child));
            break;
          case 'title':
            this.title = atomparser.collectsText(child);
            break;
          case 'summary':
            this.summary = atomparser.collectsText(child);
            break;
          case 'link':
            // Links are keyed by 'rel|type'; a later link with the same pair
            // overwrites an earlier one.
            var linkKey = atomparser.joinStr(child.getAttribute('rel'), child.getAttribute('type'));
            this.link_[linkKey] = child.getAttribute('href');
            break;
          case 'author':
            this.parseAuthor_(child);
            break;
          case 'entry':
            this.entries_.push(new atomparser.Entry(child));
            break;
        }
      }
    }
  }
};

/**
 * The XML namespace URI of ATOM elements. Child elements are compared against
 * this value when the predefined properties are extracted, and getElements()
 * falls back to it when no namespace is given.
 * @type {string}
 * @const
 */
atomparser.Base.NAMESPACE_URI = 'http://www.w3.org/2005/Atom';

/**
 * The element that was parsed (the root node of the feed or entry).
 * @type {Element}
 * @private
 */
atomparser.Base.prototype.rootNode_;

/**
 * Child elements of the root node, grouped under a 'namespaceURI|localName'
 * key. Each value is the array of elements sharing that key.
 * @type {Object.<string, Array.<Element>>}
 * @private
 */
atomparser.Base.prototype.children_;

/**
 * The value of id node.
 * @type {string}
 */
atomparser.Base.prototype.id;

/**
 * The value of published node, or null when it is absent.
 * @type {goog.date.DateTime}
 */
atomparser.Base.prototype.published;

/**
 * The value of updated node, or null when it is absent.
 * @type {goog.date.DateTime}
 */
atomparser.Base.prototype.updated;

/**
 * The value of title node.
 * @type {string}
 */
atomparser.Base.prototype.title;

/**
 * The value of summary node.
 * @type {string}
 */
atomparser.Base.prototype.summary;

/**
 * The value of author/name node.
 * @type {string}
 */
atomparser.Base.prototype.authorName;

/**
 * The value of author/email node.
 * @type {string}
 */
atomparser.Base.prototype.authorEMail;

/**
 * The map of the value of link nodes: a 'rel|type' key mapped to the href.
 * @type {Object.<string, string>}
 * @private
 */
atomparser.Base.prototype.link_;

/**
 * The array of entries.
 * @type {Array.<atomparser.Entry>}
 * @private
 */
atomparser.Base.prototype.entries_;

/**
 * Extracts the author's name and e-mail address from an author element.
 * Only direct child elements in the ATOM namespace are examined; 'name' is
 * stored in authorName and 'email' in authorEMail.
 * @param {Element} authorNode The author node.
 * @private
 */
atomparser.Base.prototype.parseAuthor_ = function(authorNode) {
  var kids = authorNode.childNodes;
  for(var k = 0, count = kids.length ; k < count ; k++) {
    var kid = kids[k];
    // Skip anything that is not an ATOM-namespace element.
    if(kid.nodeType != 1 || kid.namespaceURI != atomparser.Base.NAMESPACE_URI) {
      continue;
    }
    switch(kid.localName || kid.baseName) {
      case 'name':
        this.authorName = atomparser.collectsText(kid);
        break;
      case 'email':
        this.authorEMail = atomparser.collectsText(kid);
        break;
    }
  }
};

/**
 * Looks up the direct child elements of the root node by name and namespace.
 * @param {string} name The local (prefix-less) element name.
 * @param {string=} opt_namespace
 *     The namespace URI of the element.
 *     Defaults to atomparser.Base.NAMESPACE_URI.
 * @return {Array.<Element>}
 *     The matching elements, or an empty array when nothing matches.
 */
atomparser.Base.prototype.getElements = function(name, opt_namespace) {
  var namespace = opt_namespace ? opt_namespace : atomparser.Base.NAMESPACE_URI;
  var matched   = this.children_[atomparser.joinStr(namespace, name)];
  return matched ? matched : [];
};

/**
 * Finds the href of the link node that has the given rel and type attributes.
 * @param {string} rel The value of rel attribute.
 * @param {string} type The value of type attribute.
 * @return {?string} The href value, or null when there is no such link
 *     (or its href is empty).
 */
atomparser.Base.prototype.getLink = function(rel, type) {
  var href = this.link_[atomparser.joinStr(rel, type)];
  return href ? href : null;
};

/**
 * Fetches a single entry by its position.
 * @param {number} index The zero-based index of the entry.
 * @return {?atomparser.Entry} The entry at that index, or null when the index
 *     is out of range.
 */
atomparser.Base.prototype.getEntry = function(index) {
  var entry = this.entries_[index];
  return entry ? entry : null;
};

/**
 * Returns the number of entries.
 * Takes no arguments: the original declared a stray, unused `index`
 * parameter that was never documented, which has been removed.
 * @return {number} The number of entries.
 */
atomparser.Base.prototype.getNumEntries = function() {
  return this.entries_.length;
};

/**
 * Invokes a callback once per entry by delegating to goog.array.forEach()
 * over the parsed entries.
 * @param {Function} f The function to call for every entry.
 * @param {Object=} opt_obj The object to be used as the value of 'this'
 *     within f.
 */
atomparser.Base.prototype.forEachEntry = function(f, opt_obj) {
  var entries = this.entries_;
  goog.array.forEach(entries, f, opt_obj);
};

/**
 * A parsed ATOM feed.
 * Accepts either the feed document or an element. A non-element argument is
 * treated as a document: its documentElement must be a 'feed' element in the
 * ATOM namespace. An element argument is parsed as given, without that check.
 * @constructor
 * @extends {atomparser.Base}
 * @param {Document|Element} rootNode The document of the feed or its root node.
 * @throws {atomparser.Error} If a document is passed whose root element is
 *     missing or is not an ATOM feed element.
 */
atomparser.Feed = function(rootNode) {
  var feedElement = rootNode;
  if(rootNode.nodeType != 1) {
    feedElement = rootNode.documentElement;
    var isAtomFeed = !!feedElement &&
        (feedElement.localName || feedElement.baseName) == 'feed' &&
        feedElement.namespaceURI == atomparser.Base.NAMESPACE_URI;
    if(!isAtomFeed) {
      throw new atomparser.Error('Not ATOM feed.');
    }
  }
  goog.base(this, feedElement);
};
goog.inherits(atomparser.Feed, atomparser.Base);

/**
 * A single ATOM entry, either one entry of a feed or the root of an entry
 * document.
 * A non-element argument is treated as a document: its documentElement must
 * be an 'entry' element in the ATOM namespace. An element argument is parsed
 * as given, without that check.
 * @constructor
 * @extends {atomparser.Base}
 * @param {Document|Element} entryNode The entry document or the entry element.
 * @throws {atomparser.Error} If a document is passed whose root element is
 *     missing or is not an ATOM entry element.
 */
atomparser.Entry = function(entryNode) {
  if(entryNode.nodeType != 1) {
    entryNode = entryNode.documentElement;
    if(!entryNode || (entryNode.localName || entryNode.baseName) != 'entry' ||
       entryNode.namespaceURI != atomparser.Base.NAMESPACE_URI) {
      // The original referenced the undefined namespace `atmo`, which raised
      // a ReferenceError instead of the intended atomparser.Error.
      throw new atomparser.Error('Not ATOM feed.');
    }
  }
  goog.base(this, entryNode);
};
goog.inherits(atomparser.Entry, atomparser.Base);

/**
 * An error class for atomparser.* class. atomparser.Feed throws it when the
 * document it is given does not have an ATOM feed element as its root.
 * @constructor
 * @extends {goog.debug.Error}
 * @param {*=} opt_msg The message associated with the error.
 */
atomparser.Error = function(opt_msg) {
  goog.base(this, opt_msg);
};
goog.inherits(atomparser.Error, goog.debug.Error);

/**
 * The name of the error, shared by all instances through the prototype.
 * @type {string}
 */
atomparser.Error.prototype.name = 'AtomParseError';

サンプル

百聞は一見にしかずということで、実際に動くサンプルを作りました。通常の Web ページでは ATOM フィードを XML の状態で扱うことはまずないと思うので、 Google Books で書籍検索を行う Google ガジェットです。テキストボックスに適当なキーワードを入れて検索すると、関連する書籍が表示されます。 Google ガジェットのプロキシを通しているせいか、英語圏扱いになってしまいますが・・・。

JavaScript 部分だけですが、ソースは以下のようになっています。

goog.provide('booksearchgadget.App');
goog.require('goog.dom');
goog.require('goog.events');
goog.require('atomparser.Feed');

/**
 * Application object of the book search gadget. Constructing it attaches
 * booksearchgadget.App.onSubmit to the submit event of the element whose id
 * is 'search-form'.
 * @constructor
 */
booksearchgadget.App = function() {
  var form = goog.dom.getElement('search-form');
  goog.events.listen(form, goog.events.EventType.SUBMIT, booksearchgadget.App.onSubmit);
};
goog.addSingletonGetter(booksearchgadget.App);

/**
 * Submit handler of the search form. Cancels the default form submission,
 * requests the Google Books volumes feed for the text in the 'text' element,
 * and replaces the contents of the 'result' element with a list of links,
 * one per feed entry. If parsing or rendering throws, the error is written
 * into the 'result' element instead.
 * NOTE(review): _esc and _IG_FetchXmlContent are not defined here; presumably
 * they are helpers supplied by the Google Gadgets environment. Confirm.
 * @param {Object} e The submit event; only preventDefault() is used.
 */
booksearchgadget.App.onSubmit = function(e) {
  e.preventDefault();
  var query   = goog.dom.getElement('text').value;
  var feedUrl = 'http://books.google.com/books/feeds/volumes?q=' + _esc(query);
  _IG_FetchXmlContent(feedUrl, function(response) {
    var resultEl = goog.dom.getElement('result');
    goog.dom.removeChildren(resultEl);
    try {
      var feed = new atomparser.Feed(response);
      var list = goog.dom.createDom('ul');
      feed.forEachEntry(function(entry) {
        var url = entry.getLink('alternate', 'text/html');
        var el  = goog.dom.createDom(
          'li', null, goog.dom.createDom('a', { 'href': url }, entry.title));
        goog.dom.appendChild(list, el);
      });
      goog.dom.appendChild(resultEl, list);
    } catch(err) {
      goog.dom.setTextContent(resultEl, err);
    }
  });
};

// Create the singleton instance; its constructor registers the submit listener.
booksearchgadget.App.getInstance();

atomparser 関連の部分だけ軽く説明すると、まず atomparser.Feed コンストラクタに ATOM フィードのドキュメントを渡して、インスタンスを作成します。

// Parse the fetched document; throws atomparser.Error if its root is not an ATOM feed element.
var feed = new atomparser.Feed(response);

そして、 forEachEntry() メソッドで各エントリを列挙しています。

// The callback receives each atomparser.Entry parsed from the feed.
feed.forEachEntry(function(entry) {
  // ...
});

あとは、エントリから必要な情報を取得するだけです。タイトル、更新日付など主な情報は単純にプロパティでアクセスできます。 link タグについては getLink() メソッドに rel と type の値を指定すれば、 href の値が取得できます。

// href of the link with rel="alternate" and type="text/html"; null when the entry has no such link.
var url = entry.getLink('alternate', 'text/html');
var el  = goog.dom.createDom(
  'li', null, goog.dom.createDom('a', { 'href': url }, entry.title));

だいたいこんな感じで使えます。 ATOM フィードを XPATH などを使って解析したことがあれば、だいぶ少ないコードで情報が取得できるのがわかるでしょう。とくに特定の link タグにメソッドひとつでアクセスできるのは楽だと思いますｗ

リファレンス

このライブラリでは、 ATOM フィード本体を保持する atomparser.Feed クラスと、個々のエントリを保持する atomparser.Entry クラスが定義されています。以下にそのプロパティやメソッドをまとめておきます。

Feed クラスのコンストラクタ

すでに説明済みですが、 Feed クラスのコンストラクタは以下の形式で呼び出します。

new atomparser.Feed(document_or_feed)

document_or_feed には ATOM フィードのドキュメントオブジェクトか、もしくはそのルート要素（feed タグ）を渡してください。するとその内容を解析し、プロパティやメソッドでアクセスできるようにします。ルート要素が feed タグでないドキュメントを渡すと例外を投げるので注意してください。

Feed クラスのプロパティ

代表的なタグの情報にはプロパティで簡単にアクセスできます。以下に通常利用できるはずのプロパティをまとめておきます。ただし、フィードに対応するタグがなければ、文字列のプロパティは空文字、日時のプロパティ（updated）は null になるので注意してください。

名前	内容
id	<id>タグのテキスト。
updated	更新日時を示すgoog.date.DateTimeインスタンス。
title	タイトルの文字列。
authorName	フィード作者の名前。
authorEMail	フィード作者のメールアドレス。

これら以外の要素は、後述の getElements() メソッドで取得できます。

Feed クラスのメソッド

上記のプロパティに加えて、 Feed クラスには以下のメソッドがあります。主にプロパティになっていない情報にアクセスするために使います。

getElements(tagname[, namespace]): ルート要素の任意の子要素を取得するためのメソッドです。tagnameにプレフィクスなしのタグ名、namespaceにタグの名前空間 URL を指定すると、対応する要素の配列が返ります。名前空間を省略すると ATOM の名前空間 (http://www.w3.org/2005/Atom) とみなします。
getLink(rel, type): 特定の link タグの href 属性の値（文字列）を返します。 rel, type はそれぞれ取得したい link タグの属性値です。対応する link タグがフィードに存在しなければ null を返します。
getEntry(index): index で指定した番号の Entry インスタンスを返します。 index は 0 から始まり、フィードに出現する順に振られていきます。 index の値が範囲外であれば null が返ります。
getNumEntries(): フィードエントリの数を返します。
forEachEntry(f[, scope]): フィードの各エントリに対して順番に関数 f を呼び出します。引数は goog.array.forEach() と同じです。 scope が指定された場合は、それが this になります。

Entry クラスのコンストラクタ

Entry クラスはフィードの各エントリの情報を保持するクラスです。通常は Feed インスタンスのコンストラクタが暗黙的に作成しますが、 GData API で個別のエントリを取得した場合など、 entry タグがルートになっている場合は、この Entry インスタンスを明示的に生成する必要があります。

Entry インスタンスの生成方法は Feed の場合とほぼ同じです。

new atomparser.Entry(document_or_entry)

document_or_entry にはエントリフィードのドキュメント、もしくはそのルート要素を指定します。ルート要素が entry でなければ例外を投げます。

Entry クラスのプロパティ

Feed クラスとほとんど変わりませんが、 Entry クラスでは以下のプロパティが使えます。

名前	内容
id	<id>タグのテキスト。
published	公開日時を示すgoog.date.DateTimeインスタンス。
updated	更新日時を示すgoog.date.DateTimeインスタンス。
title	タイトルの文字列。
summary	本文の要約。
authorName	フィード作者の名前。
authorEMail	フィード作者のメールアドレス。

Entry クラスのメソッド

Entry クラスでは、 Feed クラスとまったく同じメソッドが使えます。とくに GData API にアクセスする場合、 GData 固有のタグにアクセスするために getElements() を多用するでしょう。例えば前述のサンプルで書籍の著者名（<dc:creator> タグ）にアクセスするには、以下のようにします。

// Elements outside the ATOM namespace need their namespace URI as the second argument.
var creators = entry.getElements('creator', 'http://purl.org/dc/terms');

返り値は XML の Element の配列です。そこからテキストを取得するには、後述の atomparser.collectsText() を使ってください。

ユーティリティ関数

単に内部で使っているものを public にしているだけですが、指定した要素の子供となっているすべてのテキストノード（および CDATA ノード）の文字列を結合したものを取得する、 atomparser.collectsText() 関数があります。例えば書籍の全著者名をコンマ区切りで文字列化するには、以下のようにします。

// collectsText takes just the element, so it can be passed to map directly;
// the extra (index, array) arguments that map supplies are ignored.
var creatorNames = goog.array.map(
  entry.getElements('creator', 'http://purl.org/dc/terms'),
  atomparser.collectsText).join(',');

以上、本日は Closure Library ベースの ATOM フィード解析クラスをご紹介しました。本当は feedrenderer.js のように簡単にフィードの内容を表示するところまで作り込めると良かったのですが、そこまでは時間がありませんでした。今後の課題です。

2010年10月28日 06:50 | Comments() | TrackBack() |

この記事にコメントする

Recommendations

Books

「Closure Library」の入門書です。
詳しくはこちらの記事をどうぞ！

WebOS Goodies

WebOS の未来を模索する、ゲームプログラマあがりの Web 開発者のブログ。