c := colly.NewCollector(
	colly.UserAgent("myUserAgent"),
	colly.AllowedDomains("foo.com", "bar.com"),
)
// Custom User-Agent and allowed domains are cloned to c2
c2 := c.Clone()
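Note that Clone duplicates the collector's configuration only; registered callbacks are not carried over, so c2 needs its own handlers. A minimal sketch, where the OnResponse handler is purely illustrative:

// Callbacks are not cloned; register handlers on c2 separately
c2.OnResponse(func(r *colly.Response) {
	fmt.Println("c2 received", len(r.Body), "bytes from", r.Request.URL)
})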
package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	// Instantiate default collector
	c := colly.NewCollector(
		// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
		colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org"),
	)

	// On every a element which has href attribute call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Print link
		fmt.Printf("Link found: %q -> %s\n", e.Text, link)
		// Visit link found on page
		// Only those links are visited which are in AllowedDomains
		c.Visit(e.Request.AbsoluteURL(link))
	})

	// Before making a request print "Visiting ..."
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Start scraping on https://hackerspaces.org
	c.Visit("https://hackerspaces.org/")
}
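Failed requests pass through silently in the example above. Colly's OnError callback can be used to surface them; a minimal sketch, registered before c.Visit:

// Report failed requests (transport errors and non-2xx responses)
c.OnError(func(r *colly.Response, err error) {
	fmt.Println("Request to", r.Request.URL, "failed:", err)
})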
package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	// Instantiate default collector
	c := colly.NewCollector(
		// MaxDepth is 1, so only the links on the scraped page
		// are visited, and no further links are followed
		colly.MaxDepth(1),
	)

	// On every a element which has href attribute call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Print link
		fmt.Println(link)
		// Visit link found on page
		e.Request.Visit(link)
	})

	// Start scraping on https://en.wikipedia.org
	c.Visit("https://en.wikipedia.org/")
}
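The crawl depth that MaxDepth limits is exposed on each request as its Depth field, which is handy when tuning the limit. A small optional sketch:

// Print how deep each request is in the crawl
c.OnRequest(func(r *colly.Request) {
	fmt.Println("Depth", r.Depth, "->", r.URL)
})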
package main

import (
	"fmt"
	"net/http"
	"os"
	"time"

	"github.com/gocolly/colly"
)

func setupServer() {
	var handler http.HandlerFunc = func(w http.ResponseWriter, r *http.Request) {
		fmt.Println("received request")
		err := r.ParseMultipartForm(10000000)
		if err != nil {
			fmt.Println("server: Error")
			w.WriteHeader(500)
			w.Write([]byte("<html><body>Internal Server Error</body></html>"))
			return
		}
		w.WriteHeader(200)
		fmt.Println("server: OK")
		w.Write([]byte("<html><body>Success</body></html>"))
	}

	go http.ListenAndServe(":8080", handler)
}

func main() {
	// Start a single route http server to post an image to.
	setupServer()

	c := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5))

	// On every response print the page body, then post the form again
	c.OnHTML("html", func(e *colly.HTMLElement) {
		fmt.Println(e.Text)
		time.Sleep(1 * time.Second)
		e.Request.PostMultipart("http://localhost:8080/", generateFormData())
	})

	// Before making a request print "Posting gocolly.jpg to ..."
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Posting gocolly.jpg to", r.URL.String())
	})

	// Kick off the post loop with an initial multipart request
	c.PostMultipart("http://localhost:8080/", generateFormData())
}
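The generateFormData helper called above is not shown in the snippet. A minimal sketch of what it could look like, assuming a gocolly.jpg file in the working directory and illustrative field names (os comes from the imports above):

func generateFormData() map[string][]byte {
	// Read the image to post; error handling elided for brevity
	imgData, _ := os.ReadFile("gocolly.jpg")
	return map[string][]byte{
		// Illustrative form fields; adjust to what the server expects
		"firstname": []byte("one"),
		"lastname":  []byte("two"),
		"file":      imgData,
	}
}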
package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	// Instantiate default collector
	c := colly.NewCollector(
		// MaxDepth is 2, so only the links on the scraped page
		// and links on those pages are visited
		colly.MaxDepth(2),
		colly.Async(true),
	)

	// Limit the maximum parallelism to 2
	// This is necessary if the goroutines are dynamically
	// created to control the limit of simultaneous requests.
	//
	// Parallelism can also be controlled by spawning a fixed
	// number of goroutines.
	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

	// On every a element which has href attribute call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Print link
		fmt.Println(link)
		// Visit link found on page on a new thread
		e.Request.Visit(link)
	})

	// Start scraping on https://en.wikipedia.org
	c.Visit("https://en.wikipedia.org/")
	// Wait until threads are finished
	c.Wait()
}
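The examples ignore the error returned by c.Limit; outside of a demo it is worth checking, since a malformed rule (e.g. one without a domain pattern) is rejected rather than applied. A hedged sketch, assuming log is imported:

// Limit returns an error for invalid rules
if err := c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2}); err != nil {
	log.Fatal(err)
}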
// Instantiate default collector
c := colly.NewCollector()

// create a request queue with 2 consumer threads
q, _ := queue.New(
	2, // Number of consumer threads
	&queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
)
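The fragment above only sets the queue up; to do anything it needs URLs and a collector to consume them. A minimal completion sketch using the queue package's AddURL and Run methods, with an illustrative httpbin URL:

// Add URLs to the queue
for i := 0; i < 5; i++ {
	q.AddURL(fmt.Sprintf("http://httpbin.org/delay/1?n=%d", i))
}
// Consume the queued URLs with the collector's 2 consumer threads
q.Run(c)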
package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/debug"
)

func main() {
	url := "https://httpbin.org/delay/2"

	// Instantiate default collector
	c := colly.NewCollector(
		// Attach a debugger to the collector
		colly.Debugger(&debug.LogDebugger{}),
		colly.Async(true),
	)

	// Limit the number of threads started by colly to two
	// when visiting links whose domains match the "*httpbin.*" glob
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*httpbin.*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	})

	// Start scraping in four threads on https://httpbin.org/delay/2
	for i := 0; i < 4; i++ {
		c.Visit(fmt.Sprintf("%s?n=%d", url, i))
	}
	// Start scraping on https://httpbin.org/delay/2
	c.Visit(url)
	// Wait until threads are finished
	c.Wait()
}
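To watch the limiter at work, a response-time log could be registered before the visits above. This OnResponse handler is illustrative and not part of the original example:

// Log arrival times to observe the 2-request parallelism and random delay
c.OnResponse(func(r *colly.Response) {
	fmt.Println(time.Now().Format("15:04:05"), "response from", r.Request.URL)
})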
package main

import (
	"fmt"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/debug"
)

func main() {
	url := "https://httpbin.org/delay/2"

	// Instantiate default collector
	c := colly.NewCollector(
		// Turn on asynchronous requests
		colly.Async(true),
		// Attach a debugger to the collector
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Limit the number of threads started by colly to two
	// when visiting links whose domains match the "*httpbin.*" glob
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*httpbin.*",
		Parallelism: 2,
		// Delay: 5 * time.Second,
	})

	// Start scraping in five threads on https://httpbin.org/delay/2
	for i := 0; i < 5; i++ {
		c.Visit(fmt.Sprintf("%s?n=%d", url, i))
	}
	// Wait until threads are finished
	c.Wait()
}
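Delay and RandomDelay can also be combined: each new request then waits the fixed Delay plus a random duration up to RandomDelay. A hedged variant of the rule above, assuming time is imported:

c.Limit(&colly.LimitRule{
	DomainGlob:  "*httpbin.*",
	Parallelism: 2,
	Delay:       1 * time.Second, // fixed wait between requests
	RandomDelay: 2 * time.Second, // extra random wait, up to 2s
})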
package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	// Instantiate default collector
	c := colly.NewCollector()

	// Before making a request put the URL with
	// the key of "url" into the context of the request
	c.OnRequest(func(r *colly.Request) {
		r.Ctx.Put("url", r.URL.String())
	})

	// After making a request get "url" from
	// the context of the request
	c.OnResponse(func(r *colly.Response) {
		fmt.Println(r.Ctx.Get("url"))
	})

	// Start scraping on https://en.wikipedia.org
	c.Visit("https://en.wikipedia.org/")
}
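Context values put into r.Ctx are also reachable from OnHTML callbacks through the originating request, which makes the context useful for passing state between the request and scraping phases. A small illustrative sketch:

// Read the stored URL while scraping the page title
c.OnHTML("title", func(e *colly.HTMLElement) {
	fmt.Println("Title of", e.Request.Ctx.Get("url"), "is", e.Text)
})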
package main

import (
	"fmt"
	"regexp"

	"github.com/gocolly/colly"
)

func main() {
	// Instantiate default collector
	c := colly.NewCollector(
		// Visit only the root url and urls which start with "e" or "h" on httpbin.org
		colly.URLFilters(
			regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
			regexp.MustCompile("http://httpbin\\.org/h.+"),
		),
	)

	// On every a element which has href attribute call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Print link
		fmt.Printf("Link found: %q -> %s\n", e.Text, link)
		// Visit link found on page
		// Only those links are visited which are matched by any of the URLFilter regexps
		c.Visit(e.Request.AbsoluteURL(link))
	})

	// Before making a request print "Visiting ..."
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Start scraping on http://httpbin.org
	c.Visit("http://httpbin.org/")
}
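URLFilters acts as an allow list; the inverse option, DisallowedURLFilters, rejects any URL matching its patterns. A hedged sketch with an illustrative pattern:

c := colly.NewCollector(
	// Skip URLs matching this pattern even if a URLFilter would allow them
	colly.DisallowedURLFilters(
		regexp.MustCompile("http://httpbin\\.org/headers.*"),
	),
)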