// Snag HITs.
// ==UserScript==
// @name        HIT Scraper (classic version)
// @author      Kerek
// @description Snag HITs.
// Based in part on code from mmmturkeybacon Export Mturk History and mmmturkeybacon Color Coded Search with Checkpoints
// @namespace   http://userscripts.org/users/536998
// @match       https://www.mturk.com/mturk/findhits?match=true#hit_scraper*
// @match       https://www.mturk.com/mturk/findhits?match=true?hit_scraper*
// @version     1.3.0.1
// @grant       GM_xmlhttpRequest
// @grant       GM_getValue
// @grant       GM_setValue
// @require     http://code.jquery.com/jquery-latest.min.js
// ==/UserScript==

// ---------------------------------------------------------------------------
// User configuration
// ---------------------------------------------------------------------------

//alter the requester ignore list as you desire, just follow the format below and use EXACT capitalization e.g., CrowdSource
var ignore_list = ["Oscar Smith", "Jon Brelig"];

//this searches extra pages if you skip too much, helps fill out results if you hit a chunk of ignored HITs. Change to true for this behavior.
var correct_for_skips = false;

//weight the four TO ratings for the coloring. Default has pay twice as important as fairness and nothing for communication and fast.
var COMM_WEIGHT = 0;
var PAY_WEIGHT = 10;
var FAIR_WEIGHT = 5;
var FAST_WEIGHT = 0;

//display your hitdb records if applicable
var check_hitDB = true;

//default text size
var default_text_size=11;

// ---------------------------------------------------------------------------
// HIT database (IndexedDB "HITDB") lookups
// ---------------------------------------------------------------------------

var HITStorage = {};
var indexedDB = window.indexedDB || window.webkitIndexedDB || window.mozIndexedDB;
window.IDBTransaction = window.IDBTransaction || window.webkitIDBTransaction || window.mozIDBTransaction;
window.IDBKeyRange = window.IDBKeyRange || window.webkitIDBKeyRange || window.mozIDBKeyRange;

HITStorage.IDBTransactionModes = { "READ_ONLY": "readonly", "READ_WRITE": "readwrite", "VERSION_CHANGE": "versionchange" };
var IDBKeyRange = window.IDBKeyRange;

HITStorage.indexedDB = {};
HITStorage.indexedDB.db = null;

/** Shared error handler for IndexedDB requests: logs the event. */
HITStorage.indexedDB.onerror = function(e) {
    console.log(e);
};

// Version passed to indexedDB.open() for the "HITDB" database.
var v=4;

/**
 * Opens "HITDB", looks `value` up in index `index_name` of the "HIT" object
 * store, and calls `on_result(found)` with a boolean. Does nothing (beyond
 * closing the connection) when the "HIT" store does not exist.
 *
 * @param {string} index_name - name of the index on the "HIT" store
 * @param {*} value - key to look up in that index
 * @param {function(boolean)} on_result - receives true when a record exists
 */
HITStorage.indexedDB.lookup = function(index_name, value, on_result) {
    var request = indexedDB.open("HITDB", v);
    request.onsuccess = function(e) {
        HITStorage.indexedDB.db = e.target.result;
        var db = HITStorage.indexedDB.db;
        if (!db.objectStoreNames.contains("HIT")) {
            db.close();
            return;
        }
        var trans = db.transaction(["HIT"], HITStorage.IDBTransactionModes.READ_ONLY);
        var store = trans.objectStore("HIT");
        var index = store.index(index_name);
        index.get(value).onsuccess = function(event) {
            on_result(event.target.result !== undefined);
            db.close();
        };
    };
    request.onerror = HITStorage.indexedDB.onerror;
};

/**
 * Records in history[button].titledb whether a HIT with this title exists in
 * the HIT database.
 *
 * @param {string} title - HIT title to look up
 * @param {string} button - key of the entry in `history` to update
 */
HITStorage.indexedDB.checkTitle = function(title,button) {
    HITStorage.indexedDB.lookup("title", title, function(found) {
        console.log(title + (found ? ' found' : ' not found'));
        // The entry may have been purged by start_it() before this async callback runs.
        if (history[button]) {
            history[button].titledb = found;
        }
    });
};

/**
 * Records in history[button].reqdb whether a HIT from this requester exists in
 * the HIT database.
 *
 * @param {string} id - requester id to look up
 * @param {string} button - key of the entry in `history` to update
 */
HITStorage.indexedDB.checkRequester = function(id,button) {
    HITStorage.indexedDB.lookup("requesterId", id, function(found) {
        // The entry may have been purged by start_it() before this async callback runs.
        if (history[button]) {
            history[button].reqdb = found;
        }
        console.log(id + (found ? ' found' : ' not found'));
    });
};

// ---------------------------------------------------------------------------
// Script state and constants
// ---------------------------------------------------------------------------

var PAGES_TO_SCRAPE = 3;
var MINIMUM_HITS = 100;
var SEARCH_REFRESH=0;
var URL_BASE = "/mturk/searchbar?searchWords=&selectedSearchType=hitgroups";
var initial_url = URL_BASE;
var TO_REQ_URL = "http://turkopticon.ucsd.edu/reports?id=";
var found_key_list=[];
var last_clear_time = new Date().getTime();
var searched_once = false;
var save_new_results_time = 120;
var save_results_time = 3600;
var default_type = 0;
var cur_loc = window.location.href;

var time_input = document.createElement("INPUT");
time_input.value = 0;
var page_input = document.createElement("INPUT");
page_input.value = 3;
var min_input = document.createElement("INPUT");
var new_time_display_input = document.createElement("INPUT");
new_time_display_input.value = 300;
var reward_input = document.createElement("INPUT");
var qual_input = document.createElement("INPUT");
qual_input.type = "checkbox";
qual_input.checked = true;
var masters_input = document.createElement("INPUT");
masters_input.type = "checkbox";
var sort_input1 = document.createElement("INPUT");
sort_input1.type = "radio";
sort_input1.name = "sort_type";
sort_input1.value = "latest";
sort_input1.checked = true;
var sort_input2 = document.createElement("INPUT");
sort_input2.type = "radio";
sort_input2.name = "sort_type";
sort_input2.value = "most";
var sort_input3 = document.createElement("INPUT");
sort_input3.type = "radio";
sort_input3.name = "sort_type";
sort_input3.value = "amount";
var search_input = document.createElement("INPUT");

var LINK_BASE = "https://www.mturk.com";
var BACKGROUND_COLOR = "rgb(19, 19, 19)";
var STATUSDETAIL_DELAY = 250;
var MPRE_DELAY = 3000;

var next_page = 1;

var GREEN = '#66CC66';      // > 4
var LIGHTGREEN = '#ADFF2F'; // > 3 (green-yellow)
var YELLOW = '#FFD700';
var ORANGE = '#FF9900';     // > 2
var RED = '#FF3030';        // <= 2
var BLUE = '#C0D9D9';       // no TO
var GREY = 'lightGrey';
var BROWN = '#94704D';
var DARKGREY = '#9F9F9F';

$('body').css('background', BACKGROUND_COLOR);

var API_PROXY_BASE = 'https://mturk-api.istrack.in/';
var API_MULTI_ATTRS_URL = API_PROXY_BASE + 'multi-attrs.php?ids=';
var REVIEWS_BASE = 'http://turkopticon.ucsd.edu/';

var control_panel_HTML = '<div id="control_panel" style="margin: 0 auto 0 auto;' +
    'border-bottom: 1px solid #000000; margin-bottom: 5px; ' +
    'background-color: ' + BACKGROUND_COLOR + ';"></div>';
$('body > :not(#control_panel)').hide(); //hide all nodes directly under the body
$('body').prepend(control_panel_HTML);

var control_panel = document.getElementById("control_panel");
var big_red_button = document.createElement("BUTTON");
var progress_report = document.createTextNode("Stopped");
var text_area = document.createElement("TABLE");
big_red_button.textContent = "Show Interface";
big_red_button.onclick = function(){show_interface();};
control_panel.appendChild(big_red_button);

show_interface();

var global_run = false;
var statusdetail_loop_finished = false;
var date_header = "";
// Maps a HIT key (requester name + title + reward + group id) to its record.
var history = {};
var wait_loop;

// ---------------------------------------------------------------------------
// Small helpers
// ---------------------------------------------------------------------------

/**
 * Escapes text for safe inclusion in HTML markup or a quoted attribute value.
 * Scraped page text and remote API values are inserted via innerHTML, so they
 * must not be interpreted as markup.
 *
 * @param {*} text - value to escape (converted with String())
 * @returns {string} the escaped text
 */
function escape_html(text) {
    return String(text)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");
}

/**
 * Builds an anchor that opens in a new tab, escaping both href and label.
 *
 * @param {string} href - link target
 * @param {*} label - visible link text
 * @returns {string} HTML for the anchor
 */
function link_html(href, label) {
    return "<a href='" + escape_html(href) + "' target='_blank'>" + escape_html(label) + "</a>";
}

/**
 * Reads a number from a text input. Returns `fallback` when the field holds
 * no digits at all or when its value does not parse to a finite number
 * (e.g. "5s"), so a typo never turns a setting into NaN.
 *
 * @param {HTMLInputElement} input - the input to read
 * @param {number} fallback - value to keep when the input is unusable
 * @returns {number} the parsed value or `fallback`
 */
function read_numeric_input(input, fallback) {
    if (input.value.replace(/[^0-9]+/g,"") === "") {
        return fallback;
    }
    var parsed = Number(input.value);
    return isFinite(parsed) ? parsed : fallback;
}

/** Updates the status text, but only while running unless `force` is true. */
function set_progress_report(text, force) {
    if (global_run == true || force == true) {
        progress_report.textContent = text;
    }
}

/** Returns the current status text. */
function get_progress_report() {
    return progress_report.textContent;
}

/**
 * Polls every 500 ms while running; once the scrape loop reports finished,
 * resets the button label to "Start" and the status to "Finished".
 */
function wait_until_stopped() {
    if (global_run == true) {
        if (statusdetail_loop_finished == true) {
            big_red_button.textContent = "Start";
            set_progress_report("Finished", false);
        } else {
            setTimeout(function(){wait_until_stopped();}, 500);
        }
    }
}

/**
 * Shows a once-per-second countdown ("Searching again in N seconds") in the
 * status text while the script is running.
 *
 * @param {number} wait_time - seconds remaining
 */
function display_wait_time(wait_time) {
    if (!global_run) {
        return;
    }
    var current_progress = get_progress_report();
    if (current_progress.indexOf("Searching again in") !== -1) {
        // Update the number inside the existing countdown message.
        set_progress_report(current_progress.replace(/Searching again in \d+ seconds/, "Searching again in " + wait_time + " seconds"), false);
    } else {
        set_progress_report(current_progress + " Searching again in " + wait_time + " seconds.", false);
    }
    if (wait_time > 1) {
        setTimeout(function(){display_wait_time(wait_time-1);}, 1000);
    }
}

/** Debug helper: logs every element of `ar` with its index. */
function dispArr(ar) {
    var disp = "";
    for (var z = 0; z < ar.length; z++) {
        disp += "id " + z + " is " + ar[z] + " ";
    }
    console.log(disp);
}

// ---------------------------------------------------------------------------
// Scraping
// ---------------------------------------------------------------------------

/**
 * Extracts every HIT group from one search-results page and merges it into
 * `history`, appending each key to `found_key_list`.
 *
 * @param {jQuery} $src - jQuery-wrapped markup of a search-results page
 */
function scrape($src) {
    var $requester = $src.find('a[href^="/mturk/searchbar?selectedSearchType=hitgroups&requester"]');
    var $title = $src.find('a[class="capsulelink"]');
    var $reward = $src.find('span[class="reward"]');
    var $qualified = $src.find('a[href^="/mturk/notqualified?"]');

    var not_qualified_group_IDs=[];
    $qualified.each(function(){
        var groupy = $(this).attr('href');
        groupy = groupy.replace("/mturk/notqualified?hitId=","");
        not_qualified_group_IDs.push(groupy);
    });

    // Group ids in page order, taken from either the preview or the not-qualified link.
    var $mixed = $src.find('a[href^="/mturk/preview?"],a[href^="/mturk/notqualified?"]');
    var listy =[];
    $mixed.each(function(){
        var groupy = $(this).attr('href');
        groupy = groupy.replace("/mturk/notqualified?hitId=","");
        groupy = groupy.replace("/mturk/preview?groupId=","");
        listy.push(groupy);
    });
    // Keep only the first occurrence of each id.
    listy = listy.filter(function(elem, pos) {
        return listy.indexOf(elem) == pos;
    });

    for (var j = 0; j < $requester.length; j++) {
        var $hits = $requester.eq(j).parent().parent().parent().parent().parent().parent().find('td[class="capsule_field_text"]');
        var requester_name = $requester.eq(j).text().trim();
        var requester_link = $requester.eq(j).attr('href');
        var group_ID=listy[j];
        var preview_link = "/mturk/preview?groupId=" + group_ID;
        var title = $title.eq(j).text().trim();
        var reward = $reward.eq(j).text().trim();
        var hits = $hits.eq(4).text().trim();
        var requester_id = requester_link.replace('/mturk/searchbar?selectedSearchType=hitgroups&requesterId=','');
        var accept_link = preview_link.replace('preview','previewandaccept');

        var key = requester_name+title+reward+group_ID;
        found_key_list.push(key);

        if (history[key] == undefined) {
            var now = new Date().getTime();
            history[key] = {
                requester: requester_name,
                title: title,
                reward: reward,
                hits: hits,
                req_link: requester_link,
                prev_link: preview_link,
                rid: requester_id,
                acc_link: accept_link,
                // After the first completed search a previously unseen HIT counts as new;
                // on the very first search everything is back-dated so nothing is highlighted.
                new_result: searched_once ? 0 : 1000*save_new_results_time,
                qualified: not_qualified_group_IDs.indexOf(group_ID) === -1,
                found_this_time: true,
                initial_time: searched_once ? now : now-1000*save_new_results_time,
                reqdb: "",
                titledb: ""
            };
            HITStorage.indexedDB.checkRequester(requester_id,key);
            HITStorage.indexedDB.checkTitle(title,key);
        } else {
            history[key].new_result = new Date().getTime() - history[key].initial_time;
            history[key].found_this_time = true;
            history[key].hits = hits;
        }
    }
}

/**
 * Fetches and scrapes search-result pages one at a time, following the "Next"
 * link until a stop condition is met, then renders the results table.
 * Retries the same URL after MPRE_DELAY when the page-request-rate error page
 * is returned. Does nothing when the script is not running.
 *
 * @param {string} next_URL - page to fetch; an empty string ends the scrape
 */
function statusdetail_loop(next_URL) {
    if (!global_run) {
        return;
    }
    if (next_URL.length == 0) {
        render_results();
        return;
    }

    $.get(next_URL, function(data) {
        var $src = $(data);
        var maxpagerate = $src.find('td[class="error_title"]:contains("You have exceeded the maximum allowed page request rate for this website.")');
        if (maxpagerate.length != 0) {
            console.log("MPRE");
            setTimeout(function(){statusdetail_loop(next_URL);}, MPRE_DELAY);
            return;
        }

        set_progress_report("Processing page " + next_page, false);
        scrape($src);

        var $next_URL = $src.find('a[href^="/mturk/viewsearchbar"]:contains("Next")');
        var following_URL = ($next_URL.length != 0) ? $next_URL.attr("href") : "";
        next_page++;

        var stop = false;
        var j;
        if (default_type == 1) {
            // "Most available" sort: stop once any scraped HIT is below the minimum batch size.
            var hmin = MINIMUM_HITS+1;
            for (j = 0; j < found_key_list.length; j++) {
                if (history[found_key_list[j]].hits < hmin) {
                    stop = true;
                    break;
                }
            }
        } else if (next_page > PAGES_TO_SCRAPE && correct_for_skips) {
            // Allow roughly one extra page per ten ignored HITs.
            var skipped_hits = 0;
            for (j = 0; j < found_key_list.length; j++) {
                var obj = history[found_key_list[j]];
                if (!ignore_check(obj.requester,obj.title)) {
                    skipped_hits++;
                }
            }
            var added_pages = Math.floor(skipped_hits/10);
            if (skipped_hits%10 > 6) {
                added_pages++;
            }
            stop = next_page > PAGES_TO_SCRAPE + added_pages;
        } else if (next_page > PAGES_TO_SCRAPE) {
            stop = true;
        }

        if (stop) {
            following_URL = "";
            next_page = -1;
        }
        setTimeout(function(){statusdetail_loop(following_URL);}, STATUSDETAIL_DELAY);
    });
}

/**
 * Rebuilds the results table from `found_key_list`, reports the totals,
 * requests Turkopticon ratings for the shown requesters, and either schedules
 * the next search or returns the script to the stopped state.
 */
function render_results() {
    searched_once = true;
    var found_hits = found_key_list.length;
    var shown_hits = 0;
    var new_hits = 0;
    var rids = [];
    var result_rows = []; // result_rows[n] is the table row for rids[n]
    var i, j;

    // Remove every row except the header (row 0).
    for (i = text_area.rows.length - 1; i > 0; i--) {
        text_area.deleteRow(i);
    }

    for (j = 0; j < found_key_list.length; j++) {
        var obj = history[found_key_list[j]];
        if (!(ignore_check(obj.requester,obj.title) && obj.found_this_time)) {
            continue;
        }
        ++shown_hits;

        var col_heads = [
            link_html(LINK_BASE+obj.req_link, obj.requester),
            link_html(LINK_BASE+obj.prev_link, obj.title),
            escape_html(obj.reward),
            escape_html(obj.hits),
            "TO down",
            link_html(LINK_BASE+obj.acc_link, "Accept")
        ];
        if (check_hitDB) {
            col_heads.push("R");
            col_heads.push("T");
        }
        if (!obj.qualified) {
            col_heads.push("Not Qualified");
        }

        var row = text_area.insertRow(text_area.rows.length);
        // Grey until (and unless) a Turkopticon rating recolors the row.
        row.style.backgroundColor = GREY;
        rids.push(obj.rid);
        result_rows.push(row);

        for (i = 0; i < col_heads.length; i++) {
            var this_cell = row.insertCell(i);
            this_cell.style.fontSize = default_text_size + "px";
            this_cell.innerHTML = col_heads[i];
            if (i > 1) {
                this_cell.style.textAlign = 'center';
            }
            if (check_hitDB) {
                if (i == 6) {
                    this_cell.style.backgroundColor = obj.reqdb ? GREEN : RED;
                } else if (i == 7) {
                    this_cell.style.backgroundColor = obj.titledb ? GREEN : RED;
                } else if (i == 8) {
                    this_cell.style.backgroundColor = DARKGREY;
                }
            } else if (i == 6) {
                this_cell.style.backgroundColor = DARKGREY;
            }
        }

        // Emphasize HITs first seen less than save_new_results_time seconds ago.
        if (obj.new_result < 1000*save_new_results_time) {
            new_hits++;
            for (i = 0; i < col_heads.length; i++) {
                row.cells[i].style.fontSize = (default_text_size + 1) + "px";
                row.cells[i].style.fontWeight = "bold";
            }
        }
    }

    set_progress_report("Scrape complete. " + shown_hits + " HITs found (" + new_hits + " new results). " + (found_hits - shown_hits) + " HITs ignored.", false);

    request_to_ratings(rids, result_rows);

    statusdetail_loop_finished = true;
    if (SEARCH_REFRESH > 0) {
        wait_loop = setTimeout(function(){if (global_run) start_it();}, 1000*SEARCH_REFRESH);
        display_wait_time(SEARCH_REFRESH);
    } else {
        global_run = false;
        big_red_button.textContent = "Start";
    }
}

/**
 * Computes the weighted average of the four ratings using the *_WEIGHT
 * settings. Ratings that are not greater than 0 are left out of the average.
 *
 * @param {Object} attrs - object with comm, pay, fair and fast ratings
 * @returns {number} the weighted average, or 0 when nothing contributes
 */
function weighted_to_average(attrs) {
    var weighted = [
        [attrs.comm, COMM_WEIGHT],
        [attrs.pay, PAY_WEIGHT],
        [attrs.fair, FAIR_WEIGHT],
        [attrs.fast, FAST_WEIGHT]
    ];
    var sum = 0;
    var divisor = 0;
    for (var k = 0; k < weighted.length; k++) {
        if (weighted[k][0] > 0) {
            sum += weighted[k][1]*weighted[k][0];
            divisor += weighted[k][1];
        }
    }
    return divisor > 0 ? sum/divisor : 0;
}

/**
 * Maps a weighted average rating to a row color.
 * (A YELLOW tier for averages above 2.99 exists as a constant but is not used.)
 *
 * @param {number} average - weighted average rating
 * @returns {?string} the color, or null when the average is not above 0
 */
function color_for_average(average) {
    if (average > 4.49) {
        return GREEN;
    }
    if (average > 3.49) {
        return LIGHTGREEN;
    }
    if (average > 1.99) {
        return ORANGE;
    }
    if (average > 0) {
        return RED;
    }
    return null;
}

/**
 * Requests ratings for the given requester ids from the multi-attrs endpoint
 * and updates the matching rows: the "TO" cell gets the pay rating (or
 * "No data"), and rows with more than 4 reviews are colored by weighted
 * average. On a failed or unparsable response the rows keep their
 * "TO down" placeholder.
 *
 * @param {string[]} rids - requester ids, one per shown row
 * @param {HTMLTableRowElement[]} rows - rows[n] is the row for rids[n]
 */
function request_to_ratings(rids, rows) {
    if (rids.length === 0) {
        return; // nothing to look up
    }
    GM_xmlhttpRequest({
        method: "GET",
        url: API_MULTI_ATTRS_URL + rids.join(','),
        onload: function (results) {
            var rdata;
            try {
                rdata = $.parseJSON(results.responseText);
            } catch (err) {
                console.log("Could not parse Turkopticon response: " + err);
                return;
            }
            if (!rdata) {
                return;
            }
            for (var i = 0; i < rids.length; i++) {
                var row = rows[i];
                row.style.backgroundColor = GREY;
                var entry = rdata[rids[i]];
                if (!entry || !entry.attrs) {
                    row.cells[4].innerHTML = "No data";
                    continue;
                }
                row.cells[4].innerHTML = link_html(TO_REQ_URL+rids[i], entry.attrs.pay);
                if (entry.reviews > 4) {
                    var color = color_for_average(weighted_to_average(entry.attrs));
                    if (color !== null) {
                        row.style.backgroundColor = color;
                    }
                }
            }
        },
        onerror: function (e) {
            console.log(e);
        }
    });
}

/**
 * Returns true when the HIT should be shown, i.e. its requester is not in
 * `ignore_list`.
 *
 * @param {string} r - requester name
 * @param {string} t - HIT title (currently unused)
 * @returns {boolean}
 */
function ignore_check(r,t){
    return ignore_list.indexOf(r) === -1;
}

// ---------------------------------------------------------------------------
// Start / stop
// ---------------------------------------------------------------------------

/**
 * Button/Enter handler. When the button reads "Start", reads the settings
 * from the inputs, builds the search URL and starts scraping; otherwise stops
 * the current run and cancels any pending auto-refresh.
 */
function start_running() {
    if (big_red_button.textContent != "Start") {
        global_run = false;
        clearTimeout(wait_loop);
        big_red_button.textContent = "Start";
        set_progress_report("Stopped", true);
        return;
    }

    global_run = true;
    initial_url = URL_BASE;
    if (search_input.value.length > 0) {
        // Encode so characters such as '&', '#' or '$' in the term cannot corrupt the URL.
        initial_url = initial_url.replace("searchWords=", "searchWords=" + encodeURIComponent(search_input.value));
    }

    SEARCH_REFRESH = read_numeric_input(time_input, SEARCH_REFRESH);
    PAGES_TO_SCRAPE = read_numeric_input(page_input, PAGES_TO_SCRAPE);
    MINIMUM_HITS = read_numeric_input(min_input, MINIMUM_HITS);
    save_new_results_time = read_numeric_input(new_time_display_input, save_new_results_time);

    if (reward_input.value.replace(/[^0-9]+/g,"") != "") {
        initial_url += "&minReward=" + encodeURIComponent(reward_input.value);
    } else {
        initial_url += "&minReward=0.00";
    }

    initial_url += qual_input.checked ? "&qualifiedFor=on" : "&qualifiedFor=off";

    if (masters_input.checked) {
        initial_url += "&requiresMasterQual=on";
    }

    if (sort_input1.checked) {
        initial_url += "&sortType=LastUpdatedTime%3A1";
        default_type = 0;
    } else if (sort_input2.checked) {
        initial_url += "&sortType=NumHITs%3A1";
        default_type = 1;
    } else if (sort_input3.checked) {
        initial_url += "&sortType=Reward%3A1";
        default_type = 0;
    }

    initial_url += "&pageNumber=1&searchSpec=HITGroupSearch";
    start_it();
}

/**
 * Begins one scrape pass: resets per-pass state, periodically purges old
 * entries from `history`, and starts the page loop at `initial_url`.
 */
function start_it() {
    statusdetail_loop_finished = false;
    big_red_button.textContent = "Stop";
    found_key_list=[];

    var ctime = new Date().getTime();
    // Purge once more than 0.666 * save_results_time seconds have passed since the last purge.
    if (ctime - last_clear_time > save_results_time*666) {
        var last_history=history;
        history = {};
        for (var key in last_history) {
            var entry = last_history[key];
            if (entry.new_result < save_results_time*1000) {
                history[key] = entry;
                if (entry.found_this_time) {
                    entry.found_this_time = false;
                    if (entry.new_result > save_new_results_time*1000) {
                        entry.initial_time = ctime-1000*save_new_results_time;
                    }
                }
            }
        }
        last_clear_time = ctime;
    }

    next_page = 1;
    statusdetail_loop(initial_url);
}

// ---------------------------------------------------------------------------
// Interface
// ---------------------------------------------------------------------------

/**
 * Builds the control panel (settings inputs, Start button, status text) and
 * the results table header inside #control_panel, replacing the initial
 * "Show Interface" button.
 */
function show_interface() {
    /** Appends a text node to the control panel. */
    function append_text(text) {
        control_panel.appendChild(document.createTextNode(text));
    }

    /** Makes pressing Enter in `input` behave like clicking the button. */
    function start_on_enter(input) {
        input.onkeydown = function(event){if (event.keyCode == 13){start_running();}};
    }

    var i;

    control_panel.style.color = BROWN;
    control_panel.style.fontSize = "14px";
    control_panel.removeChild(big_red_button);

    append_text("Auto-refresh delay: ");
    start_on_enter(time_input);
    time_input.title = "Enter search refresh delay in seconds\n" + "Enter 0 for no auto-refresh\n" + "Default is 0 (no auto-refresh)";
    time_input.size = 3;
    control_panel.appendChild(time_input);
    append_text(" ");

    append_text("Pages to scrape: ");
    start_on_enter(page_input);
    page_input.title = "Enter number of pages to scrape\n" + "Default is 3";
    page_input.size = 3;
    control_panel.appendChild(page_input);
    append_text(" ");

    append_text("Minimum batch size: ");
    start_on_enter(min_input);
    min_input.title = "Enter minimum HITs for batch search\n" + "Default is 100";
    min_input.size = 3;
    control_panel.appendChild(min_input);
    append_text(" ");

    append_text("New HIT highlighting: ");
    start_on_enter(new_time_display_input);
    new_time_display_input.title = "Enter time (in seconds) to keep new HITs highlighted\n" + "Default is 300 (5 minutes)";
    new_time_display_input.size = 6;
    control_panel.appendChild(new_time_display_input);

    control_panel.appendChild(document.createElement("P"));

    append_text("Minimum reward: ");
    reward_input.size = 6;
    control_panel.appendChild(reward_input);
    append_text(" ");

    append_text("Qualified");
    control_panel.appendChild(qual_input);
    append_text(" ");

    append_text("Masters");
    control_panel.appendChild(masters_input);
    append_text(" ");

    append_text("Sort types: ");
    control_panel.appendChild(sort_input1);
    append_text("Latest");
    control_panel.appendChild(sort_input2);
    append_text("Most Available");
    control_panel.appendChild(sort_input3);
    append_text("Amount");

    control_panel.appendChild(document.createElement("P"));

    control_panel.appendChild(search_input);
    search_input.size = 20;
    search_input.title = "Enter a search term to include\n" + "Default is blank (no included terms)";
    search_input.placeholder="Enter search terms here";
    append_text(" ");

    big_red_button.textContent = "Start";
    big_red_button.onclick = function(){start_running();};
    control_panel.appendChild(big_red_button);
    append_text(" ");
    control_panel.appendChild(progress_report);

    control_panel.appendChild(document.createElement("P"));

    text_area.style.fontWeight = 400;
    text_area.createCaption().innerHTML = "HITs";
    var col_heads = ['Requester','Title','Reward','HITs Available','TO pay',"Accept HIT"];
    var row = text_area.createTHead().insertRow(0);
    text_area.caption.style.fontWeight = 800;
    text_area.caption.style.color = BROWN;
    // Cell padding grows with the text size: 1 per 2px above 10, clamped to 1..5.
    if (default_text_size > 10) {
        text_area.cellPadding=Math.min(Math.max(1,Math.floor((default_text_size-10)/2)),5);
    }
    console.log(text_area.cellPadding);
    text_area.caption.style.fontSize = "28px";
    text_area.rows[0].style.fontWeight = 800;
    text_area.rows[0].style.color = BROWN;
    for (i = 0; i < col_heads.length; i++) {
        var this_cell = row.insertCell(i);
        this_cell.innerHTML = col_heads[i];
        this_cell.style.fontSize = "14px";
        if (i > 1) {
            this_cell.style.textAlign = 'center';
        }
    }

    control_panel.appendChild(text_area);
}