Categories
Code

A Reddit Scraper Written in Google Apps Script

Reddit Scraper is a Google Script that pulls all posts from any reddit (subreddit) and saves the information in a Google sheet. The script extracts the post’s title, description, permalink and the posting date but can be easily extended to include user comments and thumbnail images as well.

The script runs through a background trigger every 5 minutes (configurable) and the trigger is automatically deleted once all the posts have been processed.


/* Reddit Scraper written by Amit Agarwal */
/* January 9, 2013 */

/* Name of the subreddit to scrape; scrapReddit() inserts it after "/r/"
   when building the feed URL.
   Replace LifeProTips with the Subreddit Name */
var REDDIT = "LifeProTips";

/**
 * Entry point: schedules the scraper to run in the background.
 *
 * Calls deleteTriggers_() and then installs a time-based trigger that
 * invokes scrapReddit() every 5 minutes.
 */
function run() {

  // NOTE(review): deleteTriggers_ is defined elsewhere in the file;
  // presumably it removes previously installed triggers so they do not
  // pile up when run() is invoked more than once -- confirm.
  deleteTriggers_();

  /* Posts are fetched in small batches on a 5-minute schedule to avoid
     hitting the reddit and Google Script quotas */
  var triggerBuilder = ScriptApp.newTrigger("scrapReddit");
  var everyFiveMinutes = triggerBuilder.timeBased().everyMinutes(5);
  everyFiveMinutes.create();
}


function scrapReddit() {
  
  // Process 20 Reddit posts in a batch
  var url = "http://www.reddit.com/r/" 
            + REDDIT + "/new.xml?limit=20" + getLastID_(); 

  // Reddit API returns the results in XML format  
  var response = UrlFetchApp.fetch(url);  
  var doc = XmlService.parse(response.getContentText()); 
  var entries = doc.getRootElement()
                   .getChildren('channel')[0].getChildren("item");
  
  var data = new Array();
    
  for (var i=0; i