Hello!
J'ai réécrit le script pour faire un trombi en Go
ça donne ça
package main
import (
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"os"
"os/exec"
"regexp"
)
// url is the address of the avatar directory listing fetched by main; download
// also uses it as the base that image names are appended to.
var url = "https://debian-facile.org/img/avatars"

// outputFolder is the directory downloaded images are written into and that
// the montage command reads its input files from.
var outputFolder = "images-go"
// download extracts an image name from content (one href="..." fragment),
// fetches <url>/<name> over HTTP and stores the response body in
// <outputFolder>/<name>.
//
// It returns a human-readable success message. An error is returned when the
// fragment does not name an image, when the request fails or is answered with
// a status other than 200 OK, or when the file cannot be created or written.
// The file is only created once the server has answered successfully, and a
// partially written file is removed again, so failures do not leave empty or
// truncated images behind.
func download(content []byte) (string, error) {
	imageName, err := retriveImageName(content)
	if err != nil {
		return "", err
	}

	endPoint := fmt.Sprintf("%s/%s", url, imageName)
	resp, err := http.Get(endPoint)
	if err != nil {
		return "", fmt.Errorf("Unable to get image: %s", err.Error())
	}
	defer resp.Body.Close()
	// Without this check an error page (404, 500, ...) would be saved as if
	// it were the image.
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("Unable to get image: %s answered %s", endPoint, resp.Status)
	}

	target := fmt.Sprintf("%s/%s", outputFolder, imageName)
	file, err := os.Create(target)
	if err != nil {
		return "", fmt.Errorf("Unable to create file: %s", err.Error())
	}

	_, err = io.Copy(file, resp.Body)
	// Close explicitly: a failed Close can mean the data never reached disk.
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Best effort: do not keep a truncated image around.
		os.Remove(target)
		return "", fmt.Errorf("Unable to write downloaded content into filesystem: %s", err.Error())
	}
	return fmt.Sprintf("%s downloaded successfully from %s", imageName, endPoint), nil
}
// quotedRe captures the first double-quoted value of a fragment.
var quotedRe = regexp.MustCompile(`"(.*?)"`)

// imageNameRe accepts a bare file name (no path separator) that ends in one
// of the supported image extensions.
var imageNameRe = regexp.MustCompile(`^[^/\\]+\.(png|gif|jpg|jpeg)$`)

// retriveImageName returns the image file name found between the first pair
// of double quotes in b (typically an href="..." fragment).
//
// An error is returned when b contains no quoted value, or when that value is
// not a plain file name ending in .png, .gif, .jpg or .jpeg. Values containing
// a path separator are rejected because the name is later used to build a
// local file path.
func retriveImageName(b []byte) (string, error) {
	match := quotedRe.FindSubmatch(b)
	if match == nil {
		return "", fmt.Errorf("%s is not a valid image source", string(b))
	}

	// match[0] is the value with its quotes, match[1] the value alone.
	name := string(match[1])
	if !imageNameRe.MatchString(name) {
		return "", fmt.Errorf("%s is not a valid image source", string(match[0]))
	}
	return name, nil
}
// hrefRe matches every href="..." attribute of a page. It is compiled once
// at start-up instead of on every crawl.
var hrefRe = regexp.MustCompile(`href="(.*?)\"`)

// maxParallelDownloads caps how many downloads are in flight at once, so a
// large listing does not open one connection per link simultaneously.
const maxParallelDownloads = 16

// crawl finds every href attribute in body, hands each one to download
// concurrently and logs the outcome (success message or error text) of every
// attempt. It returns once all attempts have reported back.
func crawl(body []byte) {
	hrefBytes := hrefRe.FindAll(body, -1)

	results := make(chan string)
	slots := make(chan struct{}, maxParallelDownloads)
	for _, href := range hrefBytes {
		go func(h []byte) {
			// Take a slot only for the duration of the download and give it
			// back before reporting, so a slow reader never blocks workers
			// that are still waiting for a slot.
			slots <- struct{}{}
			message, err := download(h)
			<-slots
			if err != nil {
				message = err.Error()
			}
			results <- message
		}(href)
	}

	// Exactly one result is sent per href, so receive that many.
	for range hrefBytes {
		log.Println(<-results)
	}
}
// main fetches the avatar listing at url, downloads every image it links to
// into outputFolder and then runs montage to assemble the downloaded .jpg and
// .png files into trombi-go.png.
//
// Failing to fetch or read the listing, or to create outputFolder, panics;
// a montage failure ends the program through log.Fatal.
func main() {
	res, err := http.Get(url)
	if err != nil {
		panic(fmt.Sprintf("Unable to fetch debian facile page: %s", err.Error()))
	}
	defer res.Body.Close()
	// An error page must not be crawled as if it were the listing.
	if res.StatusCode != http.StatusOK {
		panic(fmt.Sprintf("Unable to fetch debian facile page: unexpected status %s", res.Status))
	}
	bodyBytes, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(fmt.Sprintf("Unable to read request body: %s", err.Error()))
	}

	// download writes into outputFolder, so make sure it exists first.
	if err := os.MkdirAll(outputFolder, 0755); err != nil {
		panic(fmt.Sprintf("Unable to create output folder: %s", err.Error()))
	}
	crawl(bodyBytes)

	// "-title" is an option of montage, not a file inside outputFolder.
	// NOTE(review): exec.Command does not go through a shell, so the wildcard
	// patterns reach montage unexpanded — confirm montage expands them itself.
	cmd := exec.Command("montage", outputFolder+"/*.jpg", outputFolder+"/*.png", "-title", "trombi", "trombi-go.png")
	if err := cmd.Run(); err != nil {
		log.Fatal(err)
	}
}
Ça pourrait être mieux mais bon je considère ce code comme une première version.
Niveau benchmark, comparé au script bash ça donne ça:
./trombi (go) 6,10s user 0,53s system 96% cpu 6,867 total
./trombi.sh (bash) 36,34s user 11,76s system 37% cpu 2:08,02 total
Bref, je vais améliorer certaines choses, histoire de gagner un peu de temps.
EDIT
Réécriture du code:
main.go
package main
import (
"gitlab.com/df-trombi/crawler"
)
// url is the address of the avatar listing that main passes to the crawler.
var url = "https://debian-facile.org/img/avatars"

// outputFolder names the directory for downloaded images; it holds the same
// value that main gives to SetOutputDirectory.
var outputFolder = "images-go"
// main configures a crawler with the listing URL, the output directory and
// the accepted image extensions, then runs it.
func main() {
	// Named c rather than crawler so the imported package is not shadowed,
	// and fed from outputFolder so the directory name lives in one place.
	c := crawler.New().
		SetURL(url).
		SetOutputDirectory(outputFolder).
		SetFormats(".png", ".jpg", ".jpeg", ".gif")
	c.Crawl()
}
helper/helper.go
package helper
import (
"io/ioutil"
"net/http"
)
// GetResponseBody reads the whole body of response and returns its bytes.
// The body is closed before the function returns, whether or not reading
// succeeded. When reading fails, the error is returned with a nil slice.
func GetResponseBody(response *http.Response) ([]byte, error) {
	defer response.Body.Close()

	content, readErr := ioutil.ReadAll(response.Body)
	if readErr == nil {
		return content, nil
	}
	return nil, readErr
}
crawler/crawler.go
package crawler
import (
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"regexp"
"gitlab.com/df-trombi/helper"
)
func (c
*Crawler
) download
(content
[]byte) (string, error
) {
nameByte
, err
:= c
.retriveImageName
(content
)
imageName
:= string(nameByte
)
if err
!= nil {
return "", fmt
.Errorf("%s", err
.Error())
}
file
, err
:= os
.Create(fmt
.Sprintf("%s/%s", c
.outputDirectory
, imageName
))
if err
!= nil {
return "", fmt
.Errorf("Unable to create file: %s", err
.Error
())
}
defer file
.Close
()
endPoint
:= fmt
.Sprintf
("%s/%s", c
.url
, imageName
)
resp
, err
:= http
.Get
(endPoint
)
if err
!= nil {
return "", fmt
.Errorf("Unable to get image: %s", err
.Error
())
}
defer resp
.Body
.Close
()
_
, err
= io
.Copy
(file
, resp
.Body
)
if err
!= nil {
return "", fmt
.Errorf("Unable to write downloaded content into filesystem: %s", err
.Error
())
}
return fmt
.Sprintf
("%s downloaded successfully from %s", imageName
, endPoint
), nil
}
func (c
*Crawler
) retriveImageName
(b
[]byte) (string, error
) {
imgRegexp
, err
:= regexp
.Compile
(`\"(.*?)\"`)
if err
!= nil {
return "", err
}
found
:= imgRegexp
.Find(b
)
ok
, err
:= regexp
.Match(`.png|.gif|.jpg|jpeg`, found
)
if err
!= nil || !ok
{
return "", fmt
.Errorf("%s is not a valid image source", string(found
))
}
name
:= string(found
)
name
= name
[:len(name
)-1]
name
= name
[1:]
return name
, nil
}
func (c
*Crawler
) crawl
(body
[]byte) {
hrefRegexp
, err
:= regexp
.Compile
(`href="(.*?)\"`)
if err
!= nil {
log
.Printf
("%s\n", err
.Error
())
}
hrefBytes
:= hrefRegexp
.FindAll
(body
, -1)
ch
:= make(chan string)
defer close(ch
)
for _
, href
:= range hrefBytes
{
go func(h
[]byte) {
downloaded
, err
:= c
.download
(h
)
if err
!= nil {
ch <
- err
.Error
()
} else {
ch <
- downloaded
}
}(href
)
}
for i := 0; i <
len(hrefBytes
); i++ {
select {
case image
:= <
-ch
:
log
.Println
(image
)
}
}
}
// Crawler holds the settings of one crawl. Its fields are unexported and are
// set through the fluent Set* methods.
type Crawler struct {
	// url is the page that gets crawled and the base image names are
	// appended to; outputDirectory is where downloaded files are created.
	url, outputDirectory string
	// formats lists the file extensions used to build the montage input
	// patterns.
	formats []string
}
// SetFormats fluent format setter for Crawler
func (c
*Crawler
) SetFormats
(formats
...string) *Crawler
{
c
.formats
= formats
return c
}
// GetFormats formats getter for Crawler
func (c
*Crawler
) GetFormats
() []string {
return c
.formats
}
// SetURL fluent url setter for Crawler
func (c
*Crawler
) SetURL
(url
string) *Crawler
{
c
.url
= url
return c
}
// SetOutputDirectory fluent outputDirectory setter for Crawler
func (c
*Crawler
) SetOutputDirectory
(directory
string) *Crawler
{
c
.outputDirectory
= directory
return c
}
// GetOutputDirectory outputDirectory getter for Crawler
func (c
*Crawler
) GetOutputDirectory
() string {
return c
.outputDirectory
}
// GetURL url getter for Crawler
func (c
*Crawler
) GetURL
() string {
return c
.url
}
// New get new Crawler
func New
() *Crawler
{
return &Crawler
{}
}
func (c
*Crawler
) buildOutputFilePath
() []string {
var res
[]string
for _
, format
:= range c
.formats
{
res
= append
(res
, fmt
.Sprintf
("%s/*%s", c
.outputDirectory
, format
))
}
return res
}
func (c
*Crawler
) buildCommand
() *exec.Cmd {
arg
:= c
.buildOutputFilePath
()
arg
= append
(arg
, "-title")
arg
= append
(arg
, "trombi")
arg
= append
(arg
, "trombi-go.png")
return exec
.Command
("montage", arg
...)
}
// Crawl url
func (c
*Crawler
) Crawl
() {
// get pointeur of Response object
res
, err
:= http
.Get
(c
.url
)
if err
!= nil {
panic(fmt
.Sprintf
("Unable to fetch debian facile page: %s", err
.Error()))
}
// open, get and close response body
bodyBytes
, err
:= helper
.GetResponseBody
(res
)
if err
!= nil {
panic(fmt
.Sprintf
("Unable to read request body: %s", err
.Error
()))
}
c
.crawl
(bodyBytes
)
cmd
:= c
.buildCommand
()
err
= cmd
.Run
()
if err
!= nil {
log
.Fatal
(err
)
}
}
C'est un peu plus propre
PS : ouais, je sais que l'utilisation de getters/setters en Go n'est pas vraiment une bonne pratique ; on va dire que j'ai mes réflexes de dev Java…
Dernière modification par Maximilien LIX (08-04-2019 21:24:03)