This post is about testing
GoQuery from
PuerkitoBio (tnx for this amazing lib!), a library for parsing HTML, which I intend to use in my webcrawler.
Its syntax is similar to jQuery (
Readme ), so i thought thats cool, lots of people use jQuery and its well documented. Furthermore its much less boilerplate code than the plain
net/html package which comes with Go.
I had some issues to wrap my head around to understand that "
.title.may-blank"
is different
from ".title .may-blank" (note the space). The first searches for an element with
class="title may-blank", and the second one searches for a class="title" with a child class="may-blank".
After scratching my head for several hours i finally decided to look into the jQuery documentation - aaaand Bingo!! (To note is that I hate JavaScript since the 90s and try to avoid it whenever possible)
Here is my test code, parsing posts (from a static html) from reddit:
The test HTML was copied from within the firefox inspector and then formatted by the Tidy2 plugin for Notebook++
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"io"
"log"
"strings"
"unicode"
)
func ExampleGoQuery() {
// Get an io.Reader with HTML content
io := getHtmlInputReader()
// Create a qoquery document to parse from
doc, err := goquery.NewDocumentFromReader(io)
checkErr(err, "Failed to parse HTML")
fmt.Println("---- Starting to parse ------------------------")
// Find reddit posts = elements with class "thing"
thing := doc.Find(".thing")
for iThing := range thing.Nodes {
// use `single` as a selection of 1 node
singlething := thing.Eq(iThing)
// get the reddit post identifier
reddit_post_id, exists := singlething.Attr("data-fullname")
if exists == true {
// find an element with class title and a child with class may-blank
reddit_post_title := singlething.Find(".title .may-blank").Text()
reddit_post_user := singlething.Find(".author").Text()
// find an element with class comments and may-blank (in the same element, note the space!)
reddit_post_url, _ := singlething.Find(".comments.may-blank").Attr("href")
reddit_post_score := singlething.Find(".score.likes").Text()
reddit_postdate, exists := singlething.Find("time").Attr("datetime")
if exists == true {
// Remove CRLF and unnecessary whitespaces
reddit_post_title = stringMinifier(reddit_post_title)
// Print out the crawled info
fmt.Println("Id = " + reddit_post_id)
fmt.Println("Date = " + reddit_postdate)
fmt.Println("User = " + reddit_post_user)
fmt.Println("Title = " + reddit_post_title)
fmt.Println("Score = " + reddit_post_score)
fmt.Println("Url = " + reddit_post_url)
fmt.Println("-----------------------------------------------")
}
}
}
}
// Removes all unnecessary whitespaces
func stringMinifier(in string) (out string) {
white := false
for _, c := range in {
if unicode.IsSpace(c) {
if !white {
out = out + " "
}
white = true
} else {
out = out + string(c)
white = false
}
}
return
}
// returns an io.Reader with dummy test html
func getHtmlInputReader() io.Reader {
s := `
<html>
<head>
<meta name="generator"
content="HTML Tidy for HTML5 (experimental) for Windows https://github.com/w3c/tidy-html5/tree/c63cc39" />
<title></title>
</head>
<body>
<div class="thing id-t3_34z9xo odd link" onclick="click_thing(this)" data-fullname="t3_34z9xo">
<p class="parent"></p>
<span class="rank">1</span>
<div class="midcol unvoted">
<div class="arrow up login-required" onclick="$(this).vote(r.config.vote_hash, null, event)" role="button"
aria-label="upvote" tabindex="0"></div>
<div class="score dislikes">10</div>
<div class="score unvoted">11</div>
<div class="score likes">12</div>
<div class="arrow down login-required" onclick="$(this).vote(r.config.vote_hash, null, event)" role="button"
aria-label="downvote" tabindex="0"></div>
</div>
<div class="entry unvoted">
<p class="title">
<a class="title may-blank loggedin" href="https://github.com/dariubs/GoBooks" tabindex="1">dariubs/GoBooks: list of paper
and electronic books on Go</a>
<span class="domain">(
<a href="/domain/github.com/">github.com</a>)</span></p>
<p class="tagline">submitted
<time title="Tue May 5 20:13:08 2015 UTC" datetime="2015-05-05T20:13:08+00:00" class="live-timestamp">10 hours ago</time>
<a href="http://www.reddit.com/user/dgryski" class="author may-blank id-t2_3hcmx">dgryski</a></p>
<ul class="flat-list buttons">
<li class="first">
<a href="http://www.reddit.com/r/golang/comments/34z9xo/dariubsgobooks_list_of_paper_and_electronic_books/"
class="comments may-blank">1 comment</a>
</li>
<li class="share">
<span class="share-button toggle" style="">
<a class="option active login-required" href="#" tabindex="100"
onclick="return toggle(this, share, cancelShare)">share</a>
<a class="option" href="#">cancel</a>
</span>
</li>
<li class="link-save-button save-button">
<a href="#">save</a>
</li>
<li>
<form action="/post/hide" method="post" class="state-button hide-button">
<input name="executed" value="hidden" type="hidden" />
<span>
<a href="javascript:void(0)" onclick="change_state(this, 'hide', hide_thing);">hide</a>
</span>
</form>
</li>
<li class="report-button">
<a href="javascript:void(0)" class="action-thing" data-action-form="#report-action-form">report</a>
</li>
</ul>
<div class="expando" style="display: none">
<span class="error">loading...</span>
</div>
</div>
<div class="child"></div>
<div class="clearleft"></div>
</div>
<div class="thing id-t3_359t2l odd link self" onclick="click_thing(this)" data-fullname="t3_359t2l">
<p class="parent"></p>
<span class="rank">1</span>
<div class="midcol unvoted">
<div class="arrow up login-required" onclick="$(this).vote(r.config.vote_hash, null, event)" role="button"
aria-label="upvote" tabindex="0"></div>
<div class="score likes">•</div>
<div class="score unvoted">•</div>
<div class="score dislikes">•</div>
<div class="arrow down login-required" onclick="$(this).vote(r.config.vote_hash, null, event)" role="button"
aria-label="downvote" tabindex="0"></div>
</div>
<div class="entry unvoted">
<p class="title">
<a class="title may-blank loggedin" href="/r/golang/comments/359t2l/bulding_api_services_in_go/" tabindex="1"
rel="nofollow">Bulding API services in Go</a>
<span class="domain">(
<a href="/r/golang/">self.golang</a>)</span></p>
<div class="expando-button collapsed selftext" onclick="expando_child(this)"></div>
<p class="tagline">submitted
<time title="Fri May 8 08:41:00 2015 UTC" datetime="2015-05-08T08:41:00+00:00" class="live-timestamp">a minute ago</time>
by
<a href="http://www.reddit.com/user/jan1024188" class="author may-blank id-t2_8bbqm">jan1024188</a></p>
<ul class="flat-list buttons">
<li class="first">
<a href="http://www.reddit.com/r/golang/comments/359t2l/bulding_api_services_in_go/"
class="comments empty may-blank">comment</a>
</li>
<li class="share">
<span class="share-button toggle" style="">
<a class="option active login-required" href="#" tabindex="100"
onclick="return toggle(this, share, cancelShare)">share</a>
<a class="option" href="#">cancel</a>
</span>
</li>
<li class="link-save-button save-button">
<a href="#">save</a>
</li>
<li>
<form action="/post/hide" method="post" class="state-button hide-button">
<input name="executed" value="hidden" type="hidden" />
<span>
<a href="javascript:void(0)" onclick="change_state(this, 'hide', hide_thing);">hide</a>
</span>
</form>
</li>
<li class="report-button">
<a href="javascript:void(0)" class="action-thing" data-action-form="#report-action-form">report</a>
</li>
</ul>
<div class="expando" style="display: none">
<span class="error">loading...</span>
</div>
</div>
<div class="child"></div>
<div class="clearleft"></div>
</div>
</body>
</html>
`
return strings.NewReader(s)
}
func checkErr(err error, msg string) {
if err != nil {
log.Fatalln(msg, err)
}
}
func main() {
ExampleGoQuery()
}
Every day im falling more into love with Go!